aiden200 committed
Commit 4660c02 (verified) · Parent: 2313665

Training in progress, step 3050

Files changed (2)
  1. adapter_model.safetensors +1 -1
  2. train.log +373 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:472daf6308ec1dd1557df3603f3e85e6ecc913baf250d500c16975c93e3e8f2e
+ oid sha256:d34e23c8484e80b6ef5c7243ef0e9595e8747d365664c46ffcfd790b9e48b3eb
  size 1204780872
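
Only the LFS object id changes in this pointer; the size stays 1,204,780,872 bytes, so the step-3050 checkpoint replaced the previous adapter weights with a file of identical size. A minimal verification sketch, assuming the blob has already been pulled locally (e.g. with git lfs pull) and that safetensors is installed; the local path is an assumption, while the oid and size come from the pointer itself:

# Sketch: verify a local adapter_model.safetensors against the LFS pointer above,
# then list a few of the adapter tensors it stores. PATH is an assumption.
import hashlib
import os

from safetensors.torch import load_file

PATH = "adapter_model.safetensors"
EXPECTED_OID = "d34e23c8484e80b6ef5c7243ef0e9595e8747d365664c46ffcfd790b9e48b3eb"
EXPECTED_SIZE = 1204780872

# The pointer's `size` field is the byte length of the real blob.
assert os.path.getsize(PATH) == EXPECTED_SIZE, "size mismatch vs. LFS pointer"

# The pointer's `oid` is the SHA-256 of the blob; stream it rather than reading 1.2 GB at once.
sha = hashlib.sha256()
with open(PATH, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "oid mismatch vs. LFS pointer"

# Load the checkpoint and show a few adapter tensor names.
state = load_file(PATH)
print(len(state), "tensors;", sorted(state)[:3])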
train.log CHANGED
@@ -15526,3 +15526,376 @@ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device=
15526
  tensor(0.0556, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15527
  tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15528
  {'train/tv_loss': 0.00018669653218239546, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4662527973996475e-05, 'train/ref_loss': 0.09598883241415024, 'train/uncertainty_loss': -6.740234675817192e-05, 'train/video_loss': 0.09742966294288635, 'train/total_loss': 0.09744639694690704}
15529
+ [Rank 2] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}[Rank 3] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}
15530
+ [Rank 1] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}
15531
+
15532
+ [Rank 0] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}
15533
+ {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08, 'epoch': 0.98}
15534
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15535
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15536
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
15537
+ {'train/tv_loss': None, 'train/lm_loss': 0.1601017475128174, 'train/info_loss': 0.18674495816230774, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000141742208506912, 'train/video_loss': 0.18660321831703186, 'train/total_loss': 0.34670495986938477}
15538
+ tensor(0.0981, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15539
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
15540
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15541
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15542
+ tensor(0.0275, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15543
+ {'train/tv_loss': 0.00012622961075976492, 'train/lm_loss': 4.0336354868486526e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.2368880957365036, 'train/uncertainty_loss': 0.002750598080456257, 'train/video_loss': 0.24066415429115295, 'train/total_loss': 0.2407044917345047}
15544
+ [Rank 1] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}
15545
+ [Rank 2] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}[Rank 3] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}
15546
+
15547
+ [Rank 0] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}
15548
+ {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08, 'epoch': 0.98}
15549
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
15550
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
15551
+ {'train/tv_loss': None, 'train/lm_loss': 0.022706881165504456, 'train/info_loss': 0.18825730681419373, 'train/ref_loss': None, 'train/uncertainty_loss': -9.704146068543196e-05, 'train/video_loss': 0.1881602704524994, 'train/total_loss': 0.21086715161800385}
15552
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15553
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
15554
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15555
+ tensor(0.0030, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15556
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15557
+ tensor(0.2461, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15558
+ {'train/tv_loss': 0.00019956468604505063, 'train/lm_loss': 1.6450205293949693e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.38291317224502563, 'train/uncertainty_loss': 0.024610528349876405, 'train/video_loss': 0.4091346263885498, 'train/total_loss': 0.4091510772705078}
15559
+ [Rank 3] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}[Rank 2] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}
15560
+
15561
+ [Rank 0] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}[Rank 1] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}
15562
+
15563
+ {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08, 'epoch': 0.98}
15564
+ tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:0', grad_fn=<MulBackward0>)
15565
+ {'train/tv_loss': None, 'train/lm_loss': 0.27252380847930907, 'train/info_loss': 0.33596906065940857, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015895817195996643, 'train/video_loss': 0.3358100950717926, 'train/total_loss': 0.608333945274353}
15566
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15567
+ tensor(0.2933, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15568
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15569
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
15570
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
15571
+ {'train/tv_loss': None, 'train/lm_loss': 0.021349179744720462, 'train/info_loss': 0.18826666474342346, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010396195575594902, 'train/video_loss': 0.188162699341774, 'train/total_loss': 0.2095118761062622}
15572
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15573
+ tensor(0.0953, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15574
+ [Rank 0] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}[Rank 1] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}
15575
+ [Rank 3] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}
15576
+
15577
+ [Rank 2] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}
15578
+ {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08, 'epoch': 0.98}
15579
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15580
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
15581
+ {'train/tv_loss': None, 'train/lm_loss': 0.3378294467926026, 'train/info_loss': 0.1996716856956482, 'train/ref_loss': None, 'train/uncertainty_loss': -8.704561041668059e-05, 'train/video_loss': 0.19958463311195374, 'train/total_loss': 0.5374140739440918}
15582
+ tensor(0.0409, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15583
+ tensor(0.0946, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15584
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
15585
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
15586
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
15587
+ tensor(0.0392, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15588
+ {'train/tv_loss': 0.0001396071049384773, 'train/lm_loss': 2.4341055541299285e-05, 'train/info_loss': 1.633140527701471e-05, 'train/ref_loss': 0.2384827733039856, 'train/uncertainty_loss': 0.003921708464622498, 'train/video_loss': 0.24353766441345215, 'train/total_loss': 0.2435620129108429}
15589
+ [Rank 2] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}
15590
+ [Rank 1] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}[Rank 3] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}
15591
+
15592
+ [Rank 0] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}
15593
+ {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08, 'epoch': 0.98}
15594
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15595
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15596
+ {'train/tv_loss': None, 'train/lm_loss': 0.4039686679840088, 'train/info_loss': 0.2144818902015686, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012274450855329633, 'train/video_loss': 0.21435914933681488, 'train/total_loss': 0.618327796459198}
15597
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15598
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15599
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15600
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15601
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15602
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15603
+ {'train/tv_loss': 0.00021858105901628733, 'train/lm_loss': 2.1337324869818986e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.06369525194168091, 'train/uncertainty_loss': -7.047837134450674e-05, 'train/video_loss': 0.06539083272218704, 'train/total_loss': 0.06541217118501663}
15604
+ [Rank 1] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}[Rank 3] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}
15605
+
15606
+ [Rank 2] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}
15607
+ [Rank 0] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}
15608
+ {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08, 'epoch': 0.98}
15609
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15610
+ {'train/tv_loss': None, 'train/lm_loss': 0.3192060708999634, 'train/info_loss': 0.10271652787923813, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010767884086817504, 'train/video_loss': 0.10260885208845139, 'train/total_loss': 0.4218149483203888}
15611
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
15612
+ tensor(0.0468, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15613
+ tensor(0.5917, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15614
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
15615
+ {'train/tv_loss': None, 'train/lm_loss': 0.3076723337173462, 'train/info_loss': 0.20474545657634735, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013230217155069113, 'train/video_loss': 0.20461314916610718, 'train/total_loss': 0.5122854709625244}
15616
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15617
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
15618
+ tensor(0.2428, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15619
+ [Rank 1] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
15620
+ [Rank 2] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
15621
+ [Rank 0] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
15622
+ [Rank 3] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
15623
+ {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08, 'epoch': 0.98}
15624
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
15625
+ tensor(0.0288, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15626
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15627
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15628
+ {'train/tv_loss': None, 'train/lm_loss': 0.2875659942626953, 'train/info_loss': 0.11822149902582169, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011541531421244144, 'train/video_loss': 0.11810608208179474, 'train/total_loss': 0.4056720733642578}
15629
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15630
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15631
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15632
+ {'train/tv_loss': 0.00014680708991363645, 'train/lm_loss': 1.6664764552842828e-05, 'train/info_loss': 1.5139350580284372e-05, 'train/ref_loss': 0.1329575628042221, 'train/uncertainty_loss': -6.927012582309544e-05, 'train/video_loss': 0.13407787680625916, 'train/total_loss': 0.13409453630447388}
15633
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15634
+ [Rank 1] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}[Rank 2] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}
15635
+ [Rank 3] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}
15636
+
15637
+ [Rank 0] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}
15638
+ {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08, 'epoch': 0.98}
15639
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15640
+ tensor(0.0327, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15641
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15642
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15643
+ {'train/tv_loss': 0.00015368129825219513, 'train/lm_loss': 1.4590684440918268e-05, 'train/info_loss': 1.3112849956087302e-05, 'train/ref_loss': 0.08234774321317673, 'train/uncertainty_loss': -6.846861215308309e-05, 'train/video_loss': 0.08352183550596237, 'train/total_loss': 0.08353642374277115}
15644
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15645
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15646
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
15647
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15648
+ {'train/tv_loss': 0.00024882378056645393, 'train/lm_loss': 2.4436411331407728e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.04439636319875717, 'train/uncertainty_loss': -6.729786400683224e-05, 'train/video_loss': 0.04633735865354538, 'train/total_loss': 0.04636179655790329}
15649
+ [Rank 3] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}[Rank 1] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}[Rank 2] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}
15650
+
15651
+
15652
+ [Rank 0] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}
15653
+ {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08, 'epoch': 0.98}
15654
+ tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
15655
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15656
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15657
+ tensor(0.3023, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15658
+ {'train/tv_loss': 0.00023792595602571966, 'train/lm_loss': 1.6783963656052946e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.4113433361053467, 'train/uncertainty_loss': 0.030228328704833985, 'train/video_loss': 0.44348907470703125, 'train/total_loss': 0.4435058534145355}
15659
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15660
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15661
+ tensor(0.3504, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15662
+ {'train/tv_loss': 0.0002617940539494157, 'train/lm_loss': 1.4757565804757179e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.45626747608184814, 'train/uncertainty_loss': 0.035035207867622375, 'train/video_loss': 0.4934108257293701, 'train/total_loss': 0.493425577878952}
15663
+ tensor(0.1190, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15664
+ [Rank 1] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}
15665
+ [Rank 0] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}[Rank 3] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}
15666
+
15667
+ [Rank 2] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}
15668
+ {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08, 'epoch': 0.98}
15669
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
15670
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15671
+ tensor(0.0039, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15672
+ {'train/tv_loss': 0.00018340945243835451, 'train/lm_loss': 2.1456521062646063e-05, 'train/info_loss': 1.537776188342832e-05, 'train/ref_loss': 0.21618469059467316, 'train/uncertainty_loss': 0.0003915365319699049, 'train/video_loss': 0.21805888414382935, 'train/total_loss': 0.2180803418159485}
15673
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15674
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15675
+ tensor(0.3194, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15676
+ tensor(0.1016, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15677
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15678
+ {'train/tv_loss': 0.00016525617102161052, 'train/lm_loss': 2.1575717255473137e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.14564038813114166, 'train/uncertainty_loss': -6.878315471112729e-05, 'train/video_loss': 0.14691051840782166, 'train/total_loss': 0.14693209528923035}
15679
+ [Rank 0] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}[Rank 1] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}
15680
+ [Rank 3] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}
15681
+
15682
+ [Rank 2] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}
15683
+ {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08, 'epoch': 0.98}
15684
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
15685
+ tensor(0.2224, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:2', grad_fn=<MulBackward0>)
15686
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15687
+ tensor(0.3880, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15688
+ {'train/tv_loss': 0.0002865825081244111, 'train/lm_loss': 7.627939921803773e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.47501638531684875, 'train/uncertainty_loss': 0.03880217373371125, 'train/video_loss': 0.5161335468292236, 'train/total_loss': 0.5162098407745361}
15689
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15690
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15691
+ {'train/tv_loss': None, 'train/lm_loss': 0.2736924409866333, 'train/info_loss': 0.19035007059574127, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012445753673091532, 'train/video_loss': 0.19022561609745026, 'train/total_loss': 0.4639180898666382}
15692
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
15693
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15694
+ [Rank 1] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}
15695
+ [Rank 0] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}[Rank 3] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}
15696
+
15697
+ [Rank 2] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}
15698
+ {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08, 'epoch': 0.98}
15699
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15700
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
15701
+ {'train/tv_loss': None, 'train/lm_loss': 0.14509937763214112, 'train/info_loss': 0.2527730464935303, 'train/ref_loss': None, 'train/uncertainty_loss': -8.878109510987998e-05, 'train/video_loss': 0.25268426537513733, 'train/total_loss': 0.39778363704681396}
15702
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15703
+ tensor(0.5379, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
15704
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15705
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
15706
+ tensor(0.1698, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15707
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15708
+ {'train/tv_loss': 9.16590914130211e-05, 'train/lm_loss': 3.5878777271136646e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.069771908223629, 'train/uncertainty_loss': -6.670778384432197e-05, 'train/video_loss': 0.07045945525169373, 'train/total_loss': 0.07049533724784851}
15709
+ [Rank 1] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
15710
+ [Rank 3] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
15711
+ [Rank 2] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
15712
+ [Rank 0] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
15713
+ {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08, 'epoch': 0.98}
15714
+ tensor(-0.0017, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0017, device='cuda:2', grad_fn=<MulBackward0>)
15715
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15716
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15717
+ tensor(0.4245, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15718
+ {'train/tv_loss': 0.0001862031174823642, 'train/lm_loss': 1.2969550152774901e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.5063884854316711, 'train/uncertainty_loss': 0.042449754476547245, 'train/video_loss': 0.5503416657447815, 'train/total_loss': 0.5503546595573425}
15719
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
15720
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15721
+ tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
15722
+ {'train/tv_loss': None, 'train/lm_loss': 0.3805509328842163, 'train/info_loss': 0.2284892201423645, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011929129250347615, 'train/video_loss': 0.22836992144584656, 'train/total_loss': 0.6089208722114563}
15723
+ tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
15724
+ [Rank 2] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}
15725
+ [Rank 1] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}
15726
+ [Rank 0] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}[Rank 3] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}
15727
+
15728
+ {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08, 'epoch': 0.98}
15729
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
15730
+ tensor(0.0213, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15731
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15732
+ tensor(0.1013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15733
+ {'train/tv_loss': 0.00010751406662166119, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.28108254075050354, 'train/uncertainty_loss': 0.010134818404912949, 'train/video_loss': 0.29209190607070923, 'train/total_loss': 0.2921086549758911}
15734
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15735
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15736
+ tensor(0.1053, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15737
+ tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
15738
+ {'train/tv_loss': None, 'train/lm_loss': 0.13767752647399903, 'train/info_loss': 0.11634093523025513, 'train/ref_loss': None, 'train/uncertainty_loss': -8.035243372432888e-05, 'train/video_loss': 0.1162605807185173, 'train/total_loss': 0.25393810868263245}
15739
+ [Rank 3] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}
15740
+ [Rank 0] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}[Rank 1] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}
15741
+ [Rank 2] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}
15742
+
15743
+ {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08, 'epoch': 0.98}
15744
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15745
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15746
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
15747
+ {'train/tv_loss': None, 'train/lm_loss': 0.340410852432251, 'train/info_loss': 0.22465088963508606, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013702560681849718, 'train/video_loss': 0.22451385855674744, 'train/total_loss': 0.5649247169494629}
15748
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
15749
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15750
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15751
+ tensor(0.3107, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15752
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15753
+ {'train/tv_loss': 0.00016929933335632088, 'train/lm_loss': 1.6664764552842828e-05, 'train/info_loss': 1.3530071555578616e-05, 'train/ref_loss': 0.1848124861717224, 'train/uncertainty_loss': -6.540960166603328e-05, 'train/video_loss': 0.18611499667167664, 'train/total_loss': 0.18613165616989136}
15754
+ [Rank 1] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}[Rank 3] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}[Rank 2] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}
15755
+
15756
+
15757
+ [Rank 0] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}
15758
+ {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08, 'epoch': 0.98}
15759
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15760
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
15761
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15762
+ tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
15763
+ {'train/tv_loss': None, 'train/lm_loss': 0.030660930275917056, 'train/info_loss': 0.1238495334982872, 'train/ref_loss': None, 'train/uncertainty_loss': -8.383804815821351e-05, 'train/video_loss': 0.12376569211483002, 'train/total_loss': 0.15442661941051483}
15764
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15765
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
15766
+ tensor(0.1368, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15767
+ {'train/tv_loss': 0.00013003707863390446, 'train/lm_loss': 1.2945709750056267e-05, 'train/info_loss': 1.3530071555578616e-05, 'train/ref_loss': 0.3049245774745941, 'train/uncertainty_loss': 0.013680557906627656, 'train/video_loss': 0.31965896487236023, 'train/total_loss': 0.3196718990802765}
15768
+ tensor(0.0753, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15769
+ [Rank 3] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}[Rank 2] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}[Rank 0] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}
15770
+
15771
+
15772
+ {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08, 'epoch': 0.98}[Rank 1] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}
15773
+
15774
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15775
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15776
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15777
+ tensor(0.1515, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15778
+ {'train/tv_loss': 0.0001313490211032331, 'train/lm_loss': 1.4757565804757179e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.31524068117141724, 'train/uncertainty_loss': 0.01514904797077179, 'train/video_loss': 0.3314545452594757, 'train/total_loss': 0.3314692974090576}
15779
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15780
+ {'train/tv_loss': None, 'train/lm_loss': 0.19247951507568362, 'train/info_loss': 0.1875615417957306, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010782489553093911, 'train/video_loss': 0.1874537169933319, 'train/total_loss': 0.37993323802948}
15781
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15782
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15783
+ tensor(0.1763, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15784
+ [Rank 0] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}[Rank 3] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}
15785
+
15786
+ [Rank 1] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}
15787
+ [Rank 2] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}
15788
+ {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08, 'epoch': 0.98}
15789
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15790
+ {'train/tv_loss': None, 'train/lm_loss': 0.3090352535247803, 'train/info_loss': 0.11806312948465347, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010535655310377479, 'train/video_loss': 0.11795777082443237, 'train/total_loss': 0.4269930422306061}
15791
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15792
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15793
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
15794
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15795
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
15796
+ {'train/tv_loss': None, 'train/lm_loss': 0.0443669855594635, 'train/info_loss': 0.20817534625530243, 'train/ref_loss': None, 'train/uncertainty_loss': -8.949771290645003e-05, 'train/video_loss': 0.20808584988117218, 'train/total_loss': 0.2524528503417969}
15797
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
15798
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15799
+ [Rank 1] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}[Rank 0] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}
15800
+ [Rank 2] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}
15801
+
15802
+ [Rank 3] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}
15803
+ {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08, 'epoch': 0.98}
15804
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
15805
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15806
+ tensor(0.1164, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15807
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15808
+ {'train/tv_loss': 0.00012342289555817844, 'train/lm_loss': 2.7726159896701577e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.15569385886192322, 'train/uncertainty_loss': -6.904435576871037e-05, 'train/video_loss': 0.15662990510463715, 'train/total_loss': 0.1566576361656189}
15809
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15810
+ {'train/tv_loss': None, 'train/lm_loss': 0.060094434022903445, 'train/info_loss': 0.060833293944597244, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011266678338870407, 'train/video_loss': 0.06072062626481056, 'train/total_loss': 0.12081506103277206}
15811
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
15812
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15813
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
15814
+ [Rank 0] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}[Rank 3] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}[Rank 2] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}
15815
+
15816
+
15817
+ [Rank 1] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}
15818
+ {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08, 'epoch': 0.98}
15819
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15820
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15821
+ {'train/tv_loss': None, 'train/lm_loss': 0.19400665760040284, 'train/info_loss': 0.0838468000292778, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010808079969137907, 'train/video_loss': 0.08373872190713882, 'train/total_loss': 0.27774539589881897}
15822
+ tensor(0.6478, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15823
+ tensor(0.1944, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15824
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
15825
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15826
+ tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15827
+ {'train/tv_loss': 0.00012543149059638382, 'train/lm_loss': 4.641471023205668e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.07925719767808914, 'train/uncertainty_loss': -6.660351064056159e-05, 'train/video_loss': 0.08021145313978195, 'train/total_loss': 0.08025787025690079}
15828
+ tensor(0.0176, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15829
+ [Rank 2] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
15830
+ [Rank 3] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
15831
+ [Rank 1] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
15832
+ [Rank 0] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
15833
+ {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08, 'epoch': 0.98}
15834
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
15835
+ {'train/tv_loss': None, 'train/lm_loss': 0.07550049424171448, 'train/info_loss': 0.2326270490884781, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010250152554363013, 'train/video_loss': 0.2325245440006256, 'train/total_loss': 0.3080250322818756}
15836
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15837
+ tensor(0.0303, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15838
+ tensor(0.0823, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15839
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15840
+ {'train/tv_loss': None, 'train/lm_loss': 0.3728404760360718, 'train/info_loss': 0.21700052917003632, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011042470578104258, 'train/video_loss': 0.2168901115655899, 'train/total_loss': 0.5897306203842163}
15841
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
15842
+ tensor(0.1946, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:1', grad_fn=<MulBackward0>)
15843
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15844
+ [Rank 0] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}[Rank 3] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}
15845
+ [Rank 1] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}
15846
+
15847
+ [Rank 2] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}
15848
+ {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08, 'epoch': 0.98}
15849
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
15850
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15851
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15852
+ {'train/tv_loss': 0.00019662757404148579, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.1474931538105011, 'train/uncertainty_loss': -6.86797546222806e-05, 'train/video_loss': 0.14901168644428253, 'train/total_loss': 0.14902842044830322}
15853
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15854
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
15855
+ tensor(0.2225, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15856
+ {'train/tv_loss': 0.0002068731002509594, 'train/lm_loss': 1.9001058535650373e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.3619014620780945, 'train/uncertainty_loss': 0.02225349098443985, 'train/video_loss': 0.38582393527030945, 'train/total_loss': 0.3858429491519928}
15857
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15858
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15859
+ [Rank 1] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}
15860
+ [Rank 3] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}[Rank 0] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}
15861
+
15862
+ [Rank 2] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}
15863
+ {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08, 'epoch': 0.98}
15864
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15865
+ {'train/tv_loss': None, 'train/lm_loss': 0.1908965349197388, 'train/info_loss': 0.2302519679069519, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012198447948321701, 'train/video_loss': 0.23012998700141907, 'train/total_loss': 0.4210265278816223}
15866
+ tensor(0.2627, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:1', grad_fn=<MulBackward0>)
15867
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15868
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15869
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15870
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15871
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
15872
+ {'train/tv_loss': None, 'train/lm_loss': 0.3204694747924805, 'train/info_loss': 0.20709535479545593, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012635280145332218, 'train/video_loss': 0.2069690078496933, 'train/total_loss': 0.5274384617805481}
15873
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15874
+ [Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}
15875
+ [Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}
15876
+ [Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}
15877
+
15878
+ {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08, 'epoch': 0.98}
15879
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
15880
+ tensor(0.0467, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15881
+ tensor(0.1316, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15882
+ tensor(0.3947, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15883
+ {'train/tv_loss': 0.00013940890785306694, 'train/lm_loss': 5.173014360480011e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.48780110478401184, 'train/uncertainty_loss': 0.03946700990200043, 'train/video_loss': 0.5284010767936707, 'train/total_loss': 0.5284528136253357}
15884
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15885
+ tensor(0.1995, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15886
+ tensor(0.1702, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15887
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15888
+ {'train/tv_loss': 0.00011268751695752144, 'train/lm_loss': 1.680780405877158e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.07056979835033417, 'train/uncertainty_loss': -6.798054673708976e-05, 'train/video_loss': 0.07141750305891037, 'train/total_loss': 0.07143431156873703}
15889
+ [Rank 0] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}[Rank 1] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}
15890
+ [Rank 2] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}
15891
+
15892
+ [Rank 3] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}
15893
+ {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08, 'epoch': 0.98}
15894
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15895
+ tensor(0.1286, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15896
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15897
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15898
+ {'train/tv_loss': 9.077095310203732e-05, 'train/lm_loss': 1.8905699835158885e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.1560274064540863, 'train/uncertainty_loss': -6.968472735024989e-05, 'train/video_loss': 0.15669789910316467, 'train/total_loss': 0.15671680867671967}
15899
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15900
+ {'train/tv_loss': None, 'train/lm_loss': 0.10165367126464844, 'train/info_loss': 0.22112657129764557, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010826733196154237, 'train/video_loss': 0.22101829946041107, 'train/total_loss': 0.3226719796657562}
15901
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
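
The appended hunk mixes three kinds of output: raw per-rank tensor prints, per-rank "[Rank k] Trainer log:" echoes, and two dict-shaped records, the per-step train/* component losses and the consolidated {'loss', 'grad_norm', 'learning_rate', 'epoch'} rows (learning rates are down in the 1e-8 range at epoch 0.98, i.e. the tail of the schedule). A minimal parsing sketch, assuming the log is available locally as train.log; the final check encodes an observation from the rows above (total_loss ≈ lm_loss + video_loss), not documented trainer behavior:

# Sketch: pull the dict-shaped records back out of the appended log.
# The local path "train.log" and the isclose tolerance are assumptions.
import ast
import math
import re

DICT_RE = re.compile(r"\{.*\}")  # each record is printed as a Python dict literal

component_rows, step_rows = [], []
with open("train.log") as f:
    for line in f:
        m = DICT_RE.search(line)
        if not m:
            continue  # raw tensor prints and other non-dict lines
        try:
            rec = ast.literal_eval(m.group(0))
        except (ValueError, SyntaxError):
            continue  # lines mangled by interleaved multi-rank writes
        if "train/total_loss" in rec:
            component_rows.append(rec)   # per-step component losses
        elif "epoch" in rec:
            step_rows.append(rec)        # consolidated per-step records
        # per-rank "Trainer log:" echoes lack an 'epoch' key and are skipped

print(len(step_rows), "optimizer steps,", len(component_rows), "component rows")

# Observation from the rows above (not documented trainer behavior):
# total_loss tracks lm_loss + video_loss.
for rec in component_rows:
    assert math.isclose(
        rec["train/total_loss"],
        rec["train/lm_loss"] + rec["train/video_loss"],
        rel_tol=1e-3,
    )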