aiden200 commited on
Commit
0b8b2a3
·
verified ·
1 Parent(s): 4660c02

Training in progress, step 3075

Browse files
Files changed (2) hide show
  1. adapter_model.safetensors +1 -1
  2. train.log +377 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d34e23c8484e80b6ef5c7243ef0e9595e8747d365664c46ffcfd790b9e48b3eb
3
  size 1204780872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4ba76fbc41735d0ff74c7b6d036dbd2170f764042bdc8759ae76333d2c00e81
3
  size 1204780872
train.log CHANGED
@@ -15899,3 +15899,380 @@ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device=
15899
  tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15900
  {'train/tv_loss': None, 'train/lm_loss': 0.10165367126464844, 'train/info_loss': 0.22112657129764557, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010826733196154237, 'train/video_loss': 0.22101829946041107, 'train/total_loss': 0.3226719796657562}
15901
  tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15899
  tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15900
  {'train/tv_loss': None, 'train/lm_loss': 0.10165367126464844, 'train/info_loss': 0.22112657129764557, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010826733196154237, 'train/video_loss': 0.22101829946041107, 'train/total_loss': 0.3226719796657562}
15901
  tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15902
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15903
+ tensor(0.1692, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15904
+ [Rank 1] Trainer log: {'loss': 0.2934, 'grad_norm': 3.267611503601074, 'learning_rate': 1.3106500107847863e-08}
15905
+ [Rank 0] Trainer log: {'loss': 0.2934, 'grad_norm': 3.267611503601074, 'learning_rate': 1.3106500107847863e-08}[Rank 2] Trainer log: {'loss': 0.2934, 'grad_norm': 3.267611503601074, 'learning_rate': 1.3106500107847863e-08}
15906
+
15907
+ [Rank 3] Trainer log: {'loss': 0.2934, 'grad_norm': 3.267611503601074, 'learning_rate': 1.3106500107847863e-08}
15908
+ {'loss': 0.2934, 'grad_norm': 3.267611503601074, 'learning_rate': 1.3106500107847863e-08, 'epoch': 0.98}
15909
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
15910
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15911
+ {'train/tv_loss': None, 'train/lm_loss': 0.08958430886268616, 'train/info_loss': 0.23267042636871338, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011827643029391766, 'train/video_loss': 0.2325521558523178, 'train/total_loss': 0.32213646173477173}
15912
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15913
+ tensor(0.6069, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15914
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15915
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
15916
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
15917
+ {'train/tv_loss': None, 'train/lm_loss': 0.276532244682312, 'train/info_loss': 0.1254427284002304, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011011430760845543, 'train/video_loss': 0.12533260881900787, 'train/total_loss': 0.4018648862838745}
15918
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15919
+ [Rank 1] Trainer log: {'loss': 0.3998, 'grad_norm': 4.857947826385498, 'learning_rate': 1.2566197733689945e-08}[Rank 2] Trainer log: {'loss': 0.3998, 'grad_norm': 4.857947826385498, 'learning_rate': 1.2566197733689945e-08}
15920
+
15921
+ [Rank 0] Trainer log: {'loss': 0.3998, 'grad_norm': 4.857947826385498, 'learning_rate': 1.2566197733689945e-08}[Rank 3] Trainer log: {'loss': 0.3998, 'grad_norm': 4.857947826385498, 'learning_rate': 1.2566197733689945e-08}
15922
+
15923
+ {'loss': 0.3998, 'grad_norm': 4.857947826385498, 'learning_rate': 1.2566197733689945e-08, 'epoch': 0.98}
15924
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15925
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
15926
+ {'train/tv_loss': None, 'train/lm_loss': 0.0507108747959137, 'train/info_loss': 0.17671772837638855, 'train/ref_loss': None, 'train/uncertainty_loss': -9.840057464316487e-05, 'train/video_loss': 0.17661932110786438, 'train/total_loss': 0.22733019292354584}
15927
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
15928
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
15929
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
15930
+ {'train/tv_loss': None, 'train/lm_loss': 0.42907042503356935, 'train/info_loss': 0.2444683164358139, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013469330733641982, 'train/video_loss': 0.24433362483978271, 'train/total_loss': 0.6734040975570679}
15931
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15932
+ tensor(-0.0006, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:1', grad_fn=<MulBackward0>)
15933
+ tensor(0.0780, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15934
+ [Rank 1] Trainer log: {'loss': 0.3129, 'grad_norm': 6.4214324951171875, 'learning_rate': 1.2037260714902187e-08}
15935
+ [Rank 2] Trainer log: {'loss': 0.3129, 'grad_norm': 6.4214324951171875, 'learning_rate': 1.2037260714902187e-08}
15936
+ [Rank 3] Trainer log: {'loss': 0.3129, 'grad_norm': 6.4214324951171875, 'learning_rate': 1.2037260714902187e-08}
15937
+ [Rank 0] Trainer log: {'loss': 0.3129, 'grad_norm': 6.4214324951171875, 'learning_rate': 1.2037260714902187e-08}
15938
+ {'loss': 0.3129, 'grad_norm': 6.4214324951171875, 'learning_rate': 1.2037260714902187e-08, 'epoch': 0.99}
15939
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15940
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15941
+ {'train/tv_loss': 0.00014790244167670608, 'train/lm_loss': 1.6760123253334316e-05, 'train/info_loss': 1.4900939277140424e-05, 'train/ref_loss': 0.14792372286319733, 'train/uncertainty_loss': -6.890620570629836e-05, 'train/video_loss': 0.1490529477596283, 'train/total_loss': 0.14906971156597137}
15942
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15943
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15944
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15945
+ {'train/tv_loss': None, 'train/lm_loss': 0.43226170539855957, 'train/info_loss': 0.12203794717788696, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011703873751685024, 'train/video_loss': 0.12192090600728989, 'train/total_loss': 0.5541825890541077}
15946
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15947
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15948
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15949
+ [Rank 1] Trainer log: {'loss': 0.3673, 'grad_norm': 1.8332874774932861, 'learning_rate': 1.151968965339756e-08}
15950
+ [Rank 2] Trainer log: {'loss': 0.3673, 'grad_norm': 1.8332874774932861, 'learning_rate': 1.151968965339756e-08}[Rank 3] Trainer log: {'loss': 0.3673, 'grad_norm': 1.8332874774932861, 'learning_rate': 1.151968965339756e-08}
15951
+
15952
+ [Rank 0] Trainer log: {'loss': 0.3673, 'grad_norm': 1.8332874774932861, 'learning_rate': 1.151968965339756e-08}
15953
+ {'loss': 0.3673, 'grad_norm': 1.8332874774932861, 'learning_rate': 1.151968965339756e-08, 'epoch': 0.99}
15954
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
15955
+ {'train/tv_loss': None, 'train/lm_loss': 0.3395741701126099, 'train/info_loss': 0.23486897349357605, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001245252205990255, 'train/video_loss': 0.23474444448947906, 'train/total_loss': 0.5743186473846436}
15956
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15957
+ tensor(0.0821, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15958
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15959
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
15960
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
15961
+ {'train/tv_loss': None, 'train/lm_loss': 0.2048206329345703, 'train/info_loss': 0.27331027388572693, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012871609069406987, 'train/video_loss': 0.2731815576553345, 'train/total_loss': 0.4780021905899048}
15962
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
15963
+ tensor(0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15964
+ [Rank 2] Trainer log: {'loss': 0.3772, 'grad_norm': 2.6298654079437256, 'learning_rate': 1.1013485138153812e-08}[Rank 3] Trainer log: {'loss': 0.3772, 'grad_norm': 2.6298654079437256, 'learning_rate': 1.1013485138153812e-08}[Rank 1] Trainer log: {'loss': 0.3772, 'grad_norm': 2.6298654079437256, 'learning_rate': 1.1013485138153812e-08}
15965
+
15966
+
15967
+ [Rank 0] Trainer log: {'loss': 0.3772, 'grad_norm': 2.6298654079437256, 'learning_rate': 1.1013485138153812e-08}
15968
+ {'loss': 0.3772, 'grad_norm': 2.6298654079437256, 'learning_rate': 1.1013485138153812e-08, 'epoch': 0.99}
15969
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
15970
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
15971
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15972
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15973
+ {'train/tv_loss': 0.00013138524955138564, 'train/lm_loss': 1.4662205649074168e-05, 'train/info_loss': 1.5139350580284372e-05, 'train/ref_loss': 0.07244722545146942, 'train/uncertainty_loss': -6.740533863194288e-05, 'train/video_loss': 0.07344604283571243, 'train/total_loss': 0.07346070557832718}
15974
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
15975
+ {'train/tv_loss': None, 'train/lm_loss': 0.31378602981567383, 'train/info_loss': 0.18519382178783417, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012989046517759563, 'train/video_loss': 0.1850639283657074, 'train/total_loss': 0.4988499581813812}
15976
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
15977
+ tensor(0.2215, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
15978
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
15979
+ [Rank 1] Trainer log: {'loss': 0.3339, 'grad_norm': 8.79587459564209, 'learning_rate': 1.0518647745214605e-08}[Rank 3] Trainer log: {'loss': 0.3339, 'grad_norm': 8.79587459564209, 'learning_rate': 1.0518647745214605e-08}[Rank 0] Trainer log: {'loss': 0.3339, 'grad_norm': 8.79587459564209, 'learning_rate': 1.0518647745214605e-08}
15980
+
15981
+
15982
+ [Rank 2] Trainer log: {'loss': 0.3339, 'grad_norm': 8.79587459564209, 'learning_rate': 1.0518647745214605e-08}
15983
+ {'loss': 0.3339, 'grad_norm': 8.79587459564209, 'learning_rate': 1.0518647745214605e-08, 'epoch': 0.99}
15984
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15985
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
15986
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
15987
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15988
+ {'train/tv_loss': 0.00012993289856240154, 'train/lm_loss': 2.159955620300025e-05, 'train/info_loss': 1.4900939277140424e-05, 'train/ref_loss': 0.12718532979488373, 'train/uncertainty_loss': -6.540914182551205e-05, 'train/video_loss': 0.12817427515983582, 'train/total_loss': 0.1281958818435669}
15989
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
15990
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
15991
+ tensor(0.1028, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
15992
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
15993
+ {'train/tv_loss': 0.0001415692619048059, 'train/lm_loss': 1.885802048491314e-05, 'train/info_loss': 1.5854584489716217e-05, 'train/ref_loss': 0.19705133140087128, 'train/uncertainty_loss': -6.794760120101274e-05, 'train/video_loss': 0.19813178479671478, 'train/total_loss': 0.1981506496667862}
15994
+ [Rank 3] Trainer log: {'loss': 0.3185, 'grad_norm': 7.227172374725342, 'learning_rate': 1.0035178037686166e-08}
15995
+ [Rank 0] Trainer log: {'loss': 0.3185, 'grad_norm': 7.227172374725342, 'learning_rate': 1.0035178037686166e-08}[Rank 1] Trainer log: {'loss': 0.3185, 'grad_norm': 7.227172374725342, 'learning_rate': 1.0035178037686166e-08}
15996
+
15997
+ [Rank 2] Trainer log: {'loss': 0.3185, 'grad_norm': 7.227172374725342, 'learning_rate': 1.0035178037686166e-08}
15998
+ {'loss': 0.3185, 'grad_norm': 7.227172374725342, 'learning_rate': 1.0035178037686166e-08, 'epoch': 0.99}
15999
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16000
+ {'train/tv_loss': None, 'train/lm_loss': 0.20952115058898926, 'train/info_loss': 0.2307051569223404, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010274610249325634, 'train/video_loss': 0.2306024134159088, 'train/total_loss': 0.4401235580444336}
16001
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16002
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16003
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16004
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
16005
+ tensor(0.0037, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16006
+ tensor(-0.0006, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
16007
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16008
+ {'train/tv_loss': 0.00016491730930283667, 'train/lm_loss': 2.159955620300025e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.06835027039051056, 'train/uncertainty_loss': -7.077574846334756e-05, 'train/video_loss': 0.0696144551038742, 'train/total_loss': 0.06963605433702469}
16009
+ [Rank 1] Trainer log: {'loss': 0.2489, 'grad_norm': 2.4929118156433105, 'learning_rate': 9.563076565741737e-09}[Rank 3] Trainer log: {'loss': 0.2489, 'grad_norm': 2.4929118156433105, 'learning_rate': 9.563076565741737e-09}
16010
+
16011
+ [Rank 0] Trainer log: {'loss': 0.2489, 'grad_norm': 2.4929118156433105, 'learning_rate': 9.563076565741737e-09}[Rank 2] Trainer log: {'loss': 0.2489, 'grad_norm': 2.4929118156433105, 'learning_rate': 9.563076565741737e-09}
16012
+
16013
+ {'loss': 0.2489, 'grad_norm': 2.4929118156433105, 'learning_rate': 9.563076565741737e-09, 'epoch': 0.99}
16014
+ tensor(-0.0016, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:2', grad_fn=<MulBackward0>)
16015
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16016
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16017
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16018
+ {'train/tv_loss': 0.00015485152835026383, 'train/lm_loss': 1.6760123253334316e-05, 'train/info_loss': 1.4662527973996475e-05, 'train/ref_loss': 0.1597135365009308, 'train/uncertainty_loss': -7.034336449578405e-05, 'train/video_loss': 0.1608966588973999, 'train/total_loss': 0.16091342270374298}
16019
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
16020
+ {'train/tv_loss': None, 'train/lm_loss': 0.2428765058517456, 'train/info_loss': 0.2180495709180832, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014412429882213474, 'train/video_loss': 0.2179054468870163, 'train/total_loss': 0.4607819616794586}
16021
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
16022
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16023
+ tensor(0.0778, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16024
+ [Rank 2] Trainer log: {'loss': 0.3591, 'grad_norm': 4.326765537261963, 'learning_rate': 9.102343866616014e-09}[Rank 3] Trainer log: {'loss': 0.3591, 'grad_norm': 4.326765537261963, 'learning_rate': 9.102343866616014e-09}[Rank 1] Trainer log: {'loss': 0.3591, 'grad_norm': 4.326765537261963, 'learning_rate': 9.102343866616014e-09}[Rank 0] Trainer log: {'loss': 0.3591, 'grad_norm': 4.326765537261963, 'learning_rate': 9.102343866616014e-09}
16025
+
16026
+
16027
+
16028
+ {'loss': 0.3591, 'grad_norm': 4.326765537261963, 'learning_rate': 9.102343866616014e-09, 'epoch': 0.99}
16029
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
16030
+ {'train/tv_loss': None, 'train/lm_loss': 0.06931447982788086, 'train/info_loss': 0.16403447091579437, 'train/ref_loss': None, 'train/uncertainty_loss': -9.056694689206779e-05, 'train/video_loss': 0.16394390165805817, 'train/total_loss': 0.23325838148593903}
16031
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16032
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16033
+ tensor(0.1401, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16034
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16035
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
16036
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16037
+ {'train/tv_loss': None, 'train/lm_loss': 0.01512097716331482, 'train/info_loss': 0.18348032236099243, 'train/ref_loss': None, 'train/uncertainty_loss': -9.820846607908607e-05, 'train/video_loss': 0.18338210880756378, 'train/total_loss': 0.19850309193134308}
16038
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16039
+ [Rank 1] Trainer log: {'loss': 0.291, 'grad_norm': 7.863997459411621, 'learning_rate': 8.652980464608495e-09}
16040
+ [Rank 3] Trainer log: {'loss': 0.291, 'grad_norm': 7.863997459411621, 'learning_rate': 8.652980464608495e-09}
16041
+ [Rank 0] Trainer log: {'loss': 0.291, 'grad_norm': 7.863997459411621, 'learning_rate': 8.652980464608495e-09}[Rank 2] Trainer log: {'loss': 0.291, 'grad_norm': 7.863997459411621, 'learning_rate': 8.652980464608495e-09}
16042
+
16043
+ {'loss': 0.291, 'grad_norm': 7.863997459411621, 'learning_rate': 8.652980464608495e-09, 'epoch': 0.99}
16044
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16045
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16046
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16047
+ {'train/tv_loss': None, 'train/lm_loss': 0.13454277515411378, 'train/info_loss': 0.08152417093515396, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010352558456361294, 'train/video_loss': 0.0814206451177597, 'train/total_loss': 0.2159634232521057}
16048
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
16049
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16050
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16051
+ tensor(0.0257, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16052
+ tensor(0.0792, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16053
+ {'train/tv_loss': 0.00010825926437973976, 'train/lm_loss': 1.683164300629869e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.26161786913871765, 'train/uncertainty_loss': 0.007922624051570893, 'train/video_loss': 0.2704205811023712, 'train/total_loss': 0.27043741941452026}
16054
+ [Rank 0] Trainer log: {'loss': 0.3472, 'grad_norm': 1.9801222085952759, 'learning_rate': 8.214986871076803e-09}[Rank 3] Trainer log: {'loss': 0.3472, 'grad_norm': 1.9801222085952759, 'learning_rate': 8.214986871076803e-09}
16055
+ [Rank 2] Trainer log: {'loss': 0.3472, 'grad_norm': 1.9801222085952759, 'learning_rate': 8.214986871076803e-09}
16056
+ [Rank 1] Trainer log: {'loss': 0.3472, 'grad_norm': 1.9801222085952759, 'learning_rate': 8.214986871076803e-09}
16057
+
16058
+ {'loss': 0.3472, 'grad_norm': 1.9801222085952759, 'learning_rate': 8.214986871076803e-09, 'epoch': 0.99}
16059
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16060
+ tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
16061
+ {'train/tv_loss': None, 'train/lm_loss': 0.06428139209747315, 'train/info_loss': 0.18149709701538086, 'train/ref_loss': None, 'train/uncertainty_loss': -8.136932738125325e-05, 'train/video_loss': 0.18141572177410126, 'train/total_loss': 0.24569711089134216}
16062
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16063
+ tensor(0.2234, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16064
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
16065
+ {'train/tv_loss': None, 'train/lm_loss': 0.35800039768218994, 'train/info_loss': 0.33279451727867126, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013650618493556976, 'train/video_loss': 0.3326580226421356, 'train/total_loss': 0.690658450126648}
16066
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16067
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16068
+ tensor(0.2653, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16069
+ [Rank 1] Trainer log: {'loss': 0.3549, 'grad_norm': 3.5326428413391113, 'learning_rate': 7.788363584443348e-09}[Rank 2] Trainer log: {'loss': 0.3549, 'grad_norm': 3.5326428413391113, 'learning_rate': 7.788363584443348e-09}
16070
+
16071
+ [Rank 0] Trainer log: {'loss': 0.3549, 'grad_norm': 3.5326428413391113, 'learning_rate': 7.788363584443348e-09}[Rank 3] Trainer log: {'loss': 0.3549, 'grad_norm': 3.5326428413391113, 'learning_rate': 7.788363584443348e-09}
16072
+
16073
+ {'loss': 0.3549, 'grad_norm': 3.5326428413391113, 'learning_rate': 7.788363584443348e-09, 'epoch': 0.99}
16074
+ tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
16075
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
16076
+ tensor(0.1503, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16077
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16078
+ {'train/tv_loss': 0.00010081094224005938, 'train/lm_loss': 1.7046202265191825e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.2110326886177063, 'train/uncertainty_loss': -7.015446899458767e-05, 'train/video_loss': 0.21178463101387024, 'train/total_loss': 0.211801677942276}
16079
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16080
+ {'train/tv_loss': None, 'train/lm_loss': 0.3401105642318726, 'train/info_loss': 0.2033451348543167, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012256884947419168, 'train/video_loss': 0.20322257280349731, 'train/total_loss': 0.5433331727981567}
16081
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16082
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16083
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16084
+ [Rank 1] Trainer log: {'loss': 0.325, 'grad_norm': 4.1238484382629395, 'learning_rate': 7.3731110901920086e-09}
16085
+ [Rank 3] Trainer log: {'loss': 0.325, 'grad_norm': 4.1238484382629395, 'learning_rate': 7.3731110901920086e-09}
16086
+ [Rank 2] Trainer log: {'loss': 0.325, 'grad_norm': 4.1238484382629395, 'learning_rate': 7.3731110901920086e-09}
16087
+ [Rank 0] Trainer log: {'loss': 0.325, 'grad_norm': 4.1238484382629395, 'learning_rate': 7.3731110901920086e-09}
16088
+ {'loss': 0.325, 'grad_norm': 4.1238484382629395, 'learning_rate': 7.3731110901920086e-09, 'epoch': 0.99}
16089
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
16090
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16091
+ {'train/tv_loss': None, 'train/lm_loss': 0.3318980693817139, 'train/info_loss': 0.20346176624298096, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000106909463647753, 'train/video_loss': 0.2033548504114151, 'train/total_loss': 0.5352529287338257}
16092
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16093
+ tensor(0.1910, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16094
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16095
+ {'train/tv_loss': None, 'train/lm_loss': 0.30203094482421877, 'train/info_loss': 0.09250164777040482, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001158339437097311, 'train/video_loss': 0.09238581359386444, 'train/total_loss': 0.3944167494773865}
16096
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16097
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16098
+ tensor(0.0061, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16099
+ [Rank 3] Trainer log: {'loss': 0.4512, 'grad_norm': 2.808148145675659, 'learning_rate': 6.96922986086368e-09}[Rank 0] Trainer log: {'loss': 0.4512, 'grad_norm': 2.808148145675659, 'learning_rate': 6.96922986086368e-09}[Rank 1] Trainer log: {'loss': 0.4512, 'grad_norm': 2.808148145675659, 'learning_rate': 6.96922986086368e-09}
16100
+
16101
+
16102
+ [Rank 2] Trainer log: {'loss': 0.4512, 'grad_norm': 2.808148145675659, 'learning_rate': 6.96922986086368e-09}
16103
+ {'loss': 0.4512, 'grad_norm': 2.808148145675659, 'learning_rate': 6.96922986086368e-09, 'epoch': 0.99}
16104
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
16105
+ {'train/tv_loss': None, 'train/lm_loss': 0.259464430809021, 'train/info_loss': 0.2232973575592041, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013513574376702308, 'train/video_loss': 0.2231622189283371, 'train/total_loss': 0.48262667655944824}
16106
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16107
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16108
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16109
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16110
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16111
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16112
+ {'train/tv_loss': 0.00021494310349226, 'train/lm_loss': 1.919177448144183e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.16463173925876617, 'train/uncertainty_loss': -6.895140977576375e-05, 'train/video_loss': 0.16629795730113983, 'train/total_loss': 0.1663171499967575}
16113
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16114
+ [Rank 3] Trainer log: {'loss': 0.2346, 'grad_norm': 2.7327122688293457, 'learning_rate': 6.576720356062938e-09}[Rank 2] Trainer log: {'loss': 0.2346, 'grad_norm': 2.7327122688293457, 'learning_rate': 6.576720356062938e-09}[Rank 0] Trainer log: {'loss': 0.2346, 'grad_norm': 2.7327122688293457, 'learning_rate': 6.576720356062938e-09}
16115
+
16116
+
16117
+ [Rank 1] Trainer log: {'loss': 0.2346, 'grad_norm': 2.7327122688293457, 'learning_rate': 6.576720356062938e-09}
16118
+ {'loss': 0.2346, 'grad_norm': 2.7327122688293457, 'learning_rate': 6.576720356062938e-09, 'epoch': 0.99}
16119
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16120
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16121
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16122
+ {'train/tv_loss': 0.0001760338549502194, 'train/lm_loss': 1.9167935533914715e-05, 'train/info_loss': 1.5854584489716217e-05, 'train/ref_loss': 0.14493098855018616, 'train/uncertainty_loss': -7.221120176836848e-05, 'train/video_loss': 0.14628289639949799, 'train/total_loss': 0.14630205929279327}
16123
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16124
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16125
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16126
+ tensor(0.0545, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16127
+ tensor(0.1361, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16128
+ {'train/tv_loss': 0.00022776047699153424, 'train/lm_loss': 6.007245974615216e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.30153852701187134, 'train/uncertainty_loss': 0.013609066605567932, 'train/video_loss': 0.3169885277748108, 'train/total_loss': 0.3170486092567444}
16129
+ [Rank 2] Trainer log: {'loss': 0.2366, 'grad_norm': 5.121971130371094, 'learning_rate': 6.195583022451379e-09}[Rank 3] Trainer log: {'loss': 0.2366, 'grad_norm': 5.121971130371094, 'learning_rate': 6.195583022451379e-09}
16130
+
16131
+ [Rank 1] Trainer log: {'loss': 0.2366, 'grad_norm': 5.121971130371094, 'learning_rate': 6.195583022451379e-09}
16132
+ [Rank 0] Trainer log: {'loss': 0.2366, 'grad_norm': 5.121971130371094, 'learning_rate': 6.195583022451379e-09}
16133
+ {'loss': 0.2366, 'grad_norm': 5.121971130371094, 'learning_rate': 6.195583022451379e-09, 'epoch': 0.99}
16134
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
16135
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16136
+ {'train/tv_loss': None, 'train/lm_loss': 0.2746912717819214, 'train/info_loss': 0.17509594559669495, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013302697334438562, 'train/video_loss': 0.17496292293071747, 'train/total_loss': 0.449654221534729}
16137
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16138
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
16139
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16140
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16141
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16142
+ {'train/tv_loss': None, 'train/lm_loss': 0.2859696626663208, 'train/info_loss': 0.22219498455524445, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010885485680773855, 'train/video_loss': 0.2220861315727234, 'train/total_loss': 0.5080558061599731}
16143
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16144
+ [Rank 1] Trainer log: {'loss': 0.3876, 'grad_norm': 3.7306649684906006, 'learning_rate': 5.825818293750951e-09}[Rank 2] Trainer log: {'loss': 0.3876, 'grad_norm': 3.7306649684906006, 'learning_rate': 5.825818293750951e-09}[Rank 0] Trainer log: {'loss': 0.3876, 'grad_norm': 3.7306649684906006, 'learning_rate': 5.825818293750951e-09}
16145
+
16146
+
16147
+ [Rank 3] Trainer log: {'loss': 0.3876, 'grad_norm': 3.7306649684906006, 'learning_rate': 5.825818293750951e-09}
16148
+ {'loss': 0.3876, 'grad_norm': 3.7306649684906006, 'learning_rate': 5.825818293750951e-09, 'epoch': 0.99}
16149
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16150
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16151
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16152
+ tensor(0.4833, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16153
+ {'train/tv_loss': 0.00014160065911710264, 'train/lm_loss': 3.637936606537551e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.5120903253555298, 'train/uncertainty_loss': 0.04832899570465088, 'train/video_loss': 0.561572790145874, 'train/total_loss': 0.561609148979187}
16154
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
16155
+ {'train/tv_loss': None, 'train/lm_loss': 0.3400126457214356, 'train/info_loss': 0.19071005284786224, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013963867677375675, 'train/video_loss': 0.1905704140663147, 'train/total_loss': 0.5305830240249634}
16156
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16157
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16158
+ tensor(0.3868, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16159
+ [Rank 1] Trainer log: {'loss': 0.3542, 'grad_norm': 9.478456497192383, 'learning_rate': 5.467426590739511e-09}
16160
+ [Rank 2] Trainer log: {'loss': 0.3542, 'grad_norm': 9.478456497192383, 'learning_rate': 5.467426590739511e-09}
16161
+ [Rank 3] Trainer log: {'loss': 0.3542, 'grad_norm': 9.478456497192383, 'learning_rate': 5.467426590739511e-09}
16162
+ [Rank 0] Trainer log: {'loss': 0.3542, 'grad_norm': 9.478456497192383, 'learning_rate': 5.467426590739511e-09}
16163
+ {'loss': 0.3542, 'grad_norm': 9.478456497192383, 'learning_rate': 5.467426590739511e-09, 'epoch': 0.99}
16164
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
16165
+ tensor(0.0826, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16166
+ tensor(0.2363, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16167
+ tensor(0.1431, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16168
+ {'train/tv_loss': 0.0001582451048307121, 'train/lm_loss': 1.5115166024770588e-05, 'train/info_loss': 1.3351262168725953e-05, 'train/ref_loss': 0.3113810122013092, 'train/uncertainty_loss': 0.01430986523628235, 'train/video_loss': 0.3269701898097992, 'train/total_loss': 0.32698529958724976}
16169
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16170
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16171
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16172
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16173
+ {'train/tv_loss': 0.00012773239286616446, 'train/lm_loss': 2.550916105974466e-05, 'train/info_loss': 1.5139350580284372e-05, 'train/ref_loss': 0.13579870760440826, 'train/uncertainty_loss': -6.653695600107312e-05, 'train/video_loss': 0.13676917552947998, 'train/total_loss': 0.13679468631744385}
16174
+ [Rank 1] Trainer log: {'loss': 0.3057, 'grad_norm': 7.8359293937683105, 'learning_rate': 5.120408321256376e-09}
16175
+ [Rank 3] Trainer log: {'loss': 0.3057, 'grad_norm': 7.8359293937683105, 'learning_rate': 5.120408321256376e-09}
16176
+ [Rank 2] Trainer log: {'loss': 0.3057, 'grad_norm': 7.8359293937683105, 'learning_rate': 5.120408321256376e-09}
16177
+ [Rank 0] Trainer log: {'loss': 0.3057, 'grad_norm': 7.8359293937683105, 'learning_rate': 5.120408321256376e-09}
16178
+ {'loss': 0.3057, 'grad_norm': 7.8359293937683105, 'learning_rate': 5.120408321256376e-09, 'epoch': 0.99}
16179
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16180
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16181
+ {'train/tv_loss': None, 'train/lm_loss': 0.2206066608428955, 'train/info_loss': 0.1133660078048706, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011263965861871839, 'train/video_loss': 0.11325336992740631, 'train/total_loss': 0.33386003971099854}
16182
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16183
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16184
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
16185
+ {'train/tv_loss': None, 'train/lm_loss': 0.32869031429290774, 'train/info_loss': 0.1864195317029953, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014115495141595603, 'train/video_loss': 0.18627837300300598, 'train/total_loss': 0.5149686932563782}
16186
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16187
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
16188
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16189
+ [Rank 2] Trainer log: {'loss': 0.3174, 'grad_norm': 3.652292013168335, 'learning_rate': 4.7847638801956644e-09}[Rank 3] Trainer log: {'loss': 0.3174, 'grad_norm': 3.652292013168335, 'learning_rate': 4.7847638801956644e-09}
16190
+
16191
+ [Rank 0] Trainer log: {'loss': 0.3174, 'grad_norm': 3.652292013168335, 'learning_rate': 4.7847638801956644e-09}[Rank 1] Trainer log: {'loss': 0.3174, 'grad_norm': 3.652292013168335, 'learning_rate': 4.7847638801956644e-09}
16192
+
16193
+ {'loss': 0.3174, 'grad_norm': 3.652292013168335, 'learning_rate': 4.7847638801956644e-09, 'epoch': 0.99}
16194
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
16195
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16196
+ {'train/tv_loss': None, 'train/lm_loss': 0.24325459003448488, 'train/info_loss': 0.22966772317886353, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012454456882551312, 'train/video_loss': 0.22954317927360535, 'train/total_loss': 0.4727977514266968}
16197
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16198
+ tensor(0.0047, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16199
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
16200
+ {'train/tv_loss': None, 'train/lm_loss': 0.1443817973136902, 'train/info_loss': 0.10352417826652527, 'train/ref_loss': None, 'train/uncertainty_loss': -9.11391223780811e-05, 'train/video_loss': 0.1034330427646637, 'train/total_loss': 0.2478148490190506}
16201
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16202
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16203
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16204
+ [Rank 3] Trainer log: {'loss': 0.2496, 'grad_norm': 1.6570181846618652, 'learning_rate': 4.4604936495085125e-09}
16205
+ [Rank 1] Trainer log: {'loss': 0.2496, 'grad_norm': 1.6570181846618652, 'learning_rate': 4.4604936495085125e-09}
16206
+ [Rank 2] Trainer log: {'loss': 0.2496, 'grad_norm': 1.6570181846618652, 'learning_rate': 4.4604936495085125e-09}
16207
+ [Rank 0] Trainer log: {'loss': 0.2496, 'grad_norm': 1.6570181846618652, 'learning_rate': 4.4604936495085125e-09}
16208
+ {'loss': 0.2496, 'grad_norm': 1.6570181846618652, 'learning_rate': 4.4604936495085125e-09, 'epoch': 0.99}
16209
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16210
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16211
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16212
+ tensor(0.0908, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16213
+ {'train/tv_loss': 0.00022518418263643982, 'train/lm_loss': 2.5103901862166822e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.27447566390037537, 'train/uncertainty_loss': 0.009077220410108567, 'train/video_loss': 0.2853720486164093, 'train/total_loss': 0.28539714217185974}
16214
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16215
+ {'train/tv_loss': None, 'train/lm_loss': 0.19816415309906008, 'train/info_loss': 0.15372924506664276, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012343341950327158, 'train/video_loss': 0.15360581874847412, 'train/total_loss': 0.35176998376846313}
16216
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
16217
+ tensor(0.0529, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16218
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16219
+ [Rank 3] Trainer log: {'loss': 0.2788, 'grad_norm': 5.955737113952637, 'learning_rate': 4.1475979982030786e-09}
16220
+ [Rank 2] Trainer log: {'loss': 0.2788, 'grad_norm': 5.955737113952637, 'learning_rate': 4.1475979982030786e-09}[Rank 1] Trainer log: {'loss': 0.2788, 'grad_norm': 5.955737113952637, 'learning_rate': 4.1475979982030786e-09}
16221
+
16222
+ [Rank 0] Trainer log: {'loss': 0.2788, 'grad_norm': 5.955737113952637, 'learning_rate': 4.1475979982030786e-09}
16223
+ {'loss': 0.2788, 'grad_norm': 5.955737113952637, 'learning_rate': 4.1475979982030786e-09, 'epoch': 0.99}
16224
+ tensor(0.3397, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16225
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16226
+ {'train/tv_loss': 0.00020076511427760126, 'train/lm_loss': 1.5019805869087578e-05, 'train/info_loss': 1.2159199286543299e-05, 'train/ref_loss': 0.14647886157035828, 'train/uncertainty_loss': -7.060382631607354e-05, 'train/video_loss': 0.14802654087543488, 'train/total_loss': 0.14804156124591827}
16227
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16228
+ tensor(0.2749, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16229
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16230
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16231
+ {'train/tv_loss': None, 'train/lm_loss': 0.20169191360473634, 'train/info_loss': 0.1432848572731018, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011308164102956653, 'train/video_loss': 0.1431717723608017, 'train/total_loss': 0.3448636829853058}
16232
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16233
+ tensor(0.0510, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16234
+ [Rank 3] Trainer log: {'loss': 0.337, 'grad_norm': 2.894012928009033, 'learning_rate': 3.8460772823456504e-09}[Rank 0] Trainer log: {'loss': 0.337, 'grad_norm': 2.894012928009033, 'learning_rate': 3.8460772823456504e-09}
16235
+ [Rank 1] Trainer log: {'loss': 0.337, 'grad_norm': 2.894012928009033, 'learning_rate': 3.8460772823456504e-09}
16236
+ [Rank 2] Trainer log: {'loss': 0.337, 'grad_norm': 2.894012928009033, 'learning_rate': 3.8460772823456504e-09}
16237
+
16238
+ {'loss': 0.337, 'grad_norm': 2.894012928009033, 'learning_rate': 3.8460772823456504e-09, 'epoch': 0.99}
16239
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
16240
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
16241
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16242
+ tensor(0.0681, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16243
+ {'train/tv_loss': 9.881064761430025e-05, 'train/lm_loss': 2.2147859272081406e-05, 'train/info_loss': 1.5139350580284372e-05, 'train/ref_loss': 0.2649555206298828, 'train/uncertainty_loss': 0.006811643391847611, 'train/video_loss': 0.272572785615921, 'train/total_loss': 0.2725949287414551}
16244
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
16245
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16246
+ {'train/tv_loss': 0.0001409656135365367, 'train/lm_loss': 1.3231793127488345e-05, 'train/info_loss': 1.3351262168725953e-05, 'train/ref_loss': 0.19355475902557373, 'train/uncertainty_loss': -7.107839337550104e-05, 'train/video_loss': 0.19462475180625916, 'train/total_loss': 0.1946379840373993}
16247
+ tensor(0.4859, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16248
+ tensor(0.3200, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16249
+ [Rank 3] Trainer log: {'loss': 0.3862, 'grad_norm': 6.388055801391602, 'learning_rate': 3.555931845053984e-09}[Rank 2] Trainer log: {'loss': 0.3862, 'grad_norm': 6.388055801391602, 'learning_rate': 3.555931845053984e-09}[Rank 1] Trainer log: {'loss': 0.3862, 'grad_norm': 6.388055801391602, 'learning_rate': 3.555931845053984e-09}
16250
+
16251
+
16252
+ [Rank 0] Trainer log: {'loss': 0.3862, 'grad_norm': 6.388055801391602, 'learning_rate': 3.555931845053984e-09}
16253
+ {'loss': 0.3862, 'grad_norm': 6.388055801391602, 'learning_rate': 3.555931845053984e-09, 'epoch': 0.99}
16254
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
16255
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16256
+ tensor(0.0618, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16257
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16258
+ {'train/tv_loss': None, 'train/lm_loss': 0.2521815776824951, 'train/info_loss': 0.17088307440280914, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010039815679192544, 'train/video_loss': 0.170782670378685, 'train/total_loss': 0.42296427488327026}
16259
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16260
+ {'train/tv_loss': None, 'train/lm_loss': 0.17505099773406985, 'train/info_loss': 0.1072128415107727, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011181181762367488, 'train/video_loss': 0.10710103064775467, 'train/total_loss': 0.2821520268917084}
16261
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16262
+ tensor(0.6145, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16263
+ tensor(0.0295, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16264
+ [Rank 0] Trainer log: {'loss': 0.3621, 'grad_norm': 7.867546558380127, 'learning_rate': 3.2771620165061857e-09}[Rank 1] Trainer log: {'loss': 0.3621, 'grad_norm': 7.867546558380127, 'learning_rate': 3.2771620165061857e-09}[Rank 2] Trainer log: {'loss': 0.3621, 'grad_norm': 7.867546558380127, 'learning_rate': 3.2771620165061857e-09}
16265
+
16266
+ [Rank 3] Trainer log: {'loss': 0.3621, 'grad_norm': 7.867546558380127, 'learning_rate': 3.2771620165061857e-09}
16267
+
16268
+ {'loss': 0.3621, 'grad_norm': 7.867546558380127, 'learning_rate': 3.2771620165061857e-09, 'epoch': 0.99}
16269
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16270
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16271
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16272
+ {'train/tv_loss': 0.00014469012385234236, 'train/lm_loss': 1.699852291494608e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.22560232877731323, 'train/uncertainty_loss': -6.833352963440121e-05, 'train/video_loss': 0.22670593857765198, 'train/total_loss': 0.22672294080257416}
16273
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16274
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16275
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16276
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16277
+ {'train/tv_loss': None, 'train/lm_loss': 0.15130637884140016, 'train/info_loss': 0.1600653976202011, 'train/ref_loss': None, 'train/uncertainty_loss': -9.910131338983775e-05, 'train/video_loss': 0.15996628999710083, 'train/total_loss': 0.31127268075942993}
16278
+ tensor(0.0755, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)