aiden200 commited on
Commit
e1a1300
·
verified ·
1 Parent(s): a4f251f

Training in progress, step 2050

Browse files
Files changed (2) hide show
  1. adapter_model.safetensors +1 -1
  2. train.log +385 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:208a41e4cc54a86bdd53676765ee523c73e15be2398cb334acf3c3c1c346f887
3
  size 1204780872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a81378ab1f78803e81097b916a079f8e851fd7ab413fde0729ae1de1e9207a
3
  size 1204780872
train.log CHANGED
@@ -17028,3 +17028,388 @@ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device=
17028
  {'train/tv_loss': None, 'train/lm_loss': 0.32293994426727296, 'train/info_loss': 0.12928418815135956, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011226574424654246, 'train/video_loss': 0.1291719228029251, 'train/total_loss': 0.4521118998527527}
17029
  tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17030
  tensor(0.2072, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17028
  {'train/tv_loss': None, 'train/lm_loss': 0.32293994426727296, 'train/info_loss': 0.12928418815135956, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011226574424654246, 'train/video_loss': 0.1291719228029251, 'train/total_loss': 0.4521118998527527}
17029
  tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17030
  tensor(0.2072, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17031
+ [Rank 0] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}[Rank 3] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}[Rank 1] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}
17032
+
17033
+
17034
+ [Rank 2] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}
17035
+ {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06, 'epoch': 0.65}
17036
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
17037
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17038
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
17039
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17040
+ {'train/tv_loss': 0.00020880883093923332, 'train/lm_loss': 2.4817834491841496e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.18321365118026733, 'train/uncertainty_loss': -6.815286469645798e-05, 'train/video_loss': 0.1848347932100296, 'train/total_loss': 0.18485960364341736}
17041
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17042
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
17043
+ {'train/tv_loss': None, 'train/lm_loss': 0.2403105974197388, 'train/info_loss': 0.2913327217102051, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010404838249087334, 'train/video_loss': 0.29122868180274963, 'train/total_loss': 0.531539261341095}
17044
+ tensor(0.0252, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17045
+ tensor(0.0596, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17046
+ [Rank 3] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}[Rank 1] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}
17047
+
17048
+ [Rank 0] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}
17049
+ [Rank 2] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}
17050
+ {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06, 'epoch': 0.65}
17051
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17052
+ {'train/tv_loss': None, 'train/lm_loss': 0.38158850669860844, 'train/info_loss': 0.06055440008640289, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001311479019932449, 'train/video_loss': 0.06042325124144554, 'train/total_loss': 0.4420117735862732}
17053
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
17054
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17055
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17056
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17057
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17058
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17059
+ {'train/tv_loss': None, 'train/lm_loss': 0.2511230230331421, 'train/info_loss': 0.2582288980484009, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012226408580318093, 'train/video_loss': 0.25810661911964417, 'train/total_loss': 0.5092296600341797}
17060
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
17061
+ [Rank 3] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}[Rank 0] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}
17062
+ [Rank 1] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}
17063
+
17064
+ [Rank 2] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}
17065
+ {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06, 'epoch': 0.65}
17066
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
17067
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17068
+ {'train/tv_loss': None, 'train/lm_loss': 0.3681243896484375, 'train/info_loss': 0.07267985492944717, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012856742832809687, 'train/video_loss': 0.07255128771066666, 'train/total_loss': 0.44067567586898804}
17069
+ tensor(0.2176, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17070
+ tensor(0.0752, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17071
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
17072
+ {'train/tv_loss': None, 'train/lm_loss': 0.29500226974487304, 'train/info_loss': 0.15204651653766632, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010267778998240829, 'train/video_loss': 0.15194383263587952, 'train/total_loss': 0.4469461143016815}
17073
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17074
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17075
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17076
+ [Rank 0] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}[Rank 2] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}[Rank 1] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}
17077
+
17078
+
17079
+ [Rank 3] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}
17080
+ {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06, 'epoch': 0.65}
17081
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17082
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17083
+ {'train/tv_loss': None, 'train/lm_loss': 0.27378618717193604, 'train/info_loss': 0.18530026078224182, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001140484819188714, 'train/video_loss': 0.1851862072944641, 'train/total_loss': 0.45897239446640015}
17084
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17085
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17086
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17087
+ tensor(0.2328, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17088
+ tensor(0.1607, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17089
+ {'train/tv_loss': 0.00021110486704856157, 'train/lm_loss': 0.00011288317618891598, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.3208291828632355, 'train/uncertainty_loss': 0.016073283553123475, 'train/video_loss': 0.33861806988716125, 'train/total_loss': 0.33873096108436584}
17090
+ tensor(0.0314, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17091
+ [Rank 0] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}[Rank 1] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}
17092
+
17093
+ [Rank 2] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}
17094
+ [Rank 3] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}
17095
+ {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06, 'epoch': 0.66}
17096
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17097
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17098
+ tensor(0.0031, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17099
+ {'train/tv_loss': 0.00014839788200333716, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.21624045073986053, 'train/uncertainty_loss': 0.0003052026499062777, 'train/video_loss': 0.21775078773498535, 'train/total_loss': 0.21777255833148956}
17100
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17101
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17102
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17103
+ tensor(0.0286, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17104
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17105
+ {'train/tv_loss': 0.00024025768507272006, 'train/lm_loss': 3.1874023261480036e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.1827160120010376, 'train/uncertainty_loss': -6.815321394242347e-05, 'train/video_loss': 0.18458963930606842, 'train/total_loss': 0.18462151288986206}
17106
+ [Rank 1] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}
17107
+ [Rank 3] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}
17108
+ [Rank 0] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}[Rank 2] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}
17109
+
17110
+ {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06, 'epoch': 0.66}
17111
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
17112
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17113
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17114
+ tensor(0.0528, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17115
+ {'train/tv_loss': 0.00027445454616099597, 'train/lm_loss': 2.1790270693600178e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.250593900680542, 'train/uncertainty_loss': 0.0052795026451349265, 'train/video_loss': 0.2580859065055847, 'train/total_loss': 0.2581076920032501}
17116
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
17117
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17118
+ {'train/tv_loss': None, 'train/lm_loss': 0.21145300865173342, 'train/info_loss': 0.3242379426956177, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012023800518363715, 'train/video_loss': 0.3241176903247833, 'train/total_loss': 0.5355706810951233}
17119
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17120
+ tensor(0.0132, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17121
+ [Rank 2] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}[Rank 1] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}
17122
+ [Rank 3] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}
17123
+
17124
+ [Rank 0] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}
17125
+ {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06, 'epoch': 0.66}
17126
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17127
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17128
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17129
+ tensor(0.0294, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17130
+ {'train/tv_loss': 0.00017149079358205201, 'train/lm_loss': 4.7129800077527764e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.2356904149055481, 'train/uncertainty_loss': 0.0029447738081216815, 'train/video_loss': 0.2400316596031189, 'train/total_loss': 0.24007879197597504}
17131
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
17132
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17133
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17134
+ {'train/tv_loss': None, 'train/lm_loss': 0.10311661958694458, 'train/info_loss': 0.17362730205059052, 'train/ref_loss': None, 'train/uncertainty_loss': -9.315699571743608e-05, 'train/video_loss': 0.17353413999080658, 'train/total_loss': 0.2766507565975189}
17135
+ tensor(0.2431, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17136
+ [Rank 0] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}[Rank 1] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}[Rank 3] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}
17137
+
17138
+ [Rank 2] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}
17139
+
17140
+ {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06, 'epoch': 0.66}
17141
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17142
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17143
+ tensor(0.1595, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17144
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17145
+ {'train/tv_loss': 0.00022061308845877647, 'train/lm_loss': 3.185018722433597e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.18868407607078552, 'train/uncertainty_loss': -6.92706322297454e-05, 'train/video_loss': 0.1903991401195526, 'train/total_loss': 0.19043098390102386}
17146
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17147
+ {'train/tv_loss': None, 'train/lm_loss': 0.29790046215057375, 'train/info_loss': 0.1837206333875656, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011250831885263325, 'train/video_loss': 0.18360812962055206, 'train/total_loss': 0.48150861263275146}
17148
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17149
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
17150
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17151
+ [Rank 1] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}[Rank 0] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}
17152
+ [Rank 3] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}
17153
+
17154
+ [Rank 2] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}
17155
+ {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06, 'epoch': 0.66}
17156
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17157
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17158
+ tensor(0.1702, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17159
+ {'train/tv_loss': 0.00018524311017245056, 'train/lm_loss': 3.623634111136198e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.32768750190734863, 'train/uncertainty_loss': 0.017021270096302034, 'train/video_loss': 0.346211701631546, 'train/total_loss': 0.34624794125556946}
17160
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17161
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17162
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17163
+ tensor(0.1074, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
17164
+ tensor(0.0083, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17165
+ {'train/tv_loss': 0.00019137355266138912, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.22056730091571808, 'train/uncertainty_loss': 0.0008338701911270619, 'train/video_loss': 0.22295451164245605, 'train/total_loss': 0.222990483045578}
17166
+ [Rank 0] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}[Rank 2] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}[Rank 3] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}
17167
+
17168
+
17169
+ {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06, 'epoch': 0.66}[Rank 1] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}
17170
+
17171
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17172
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
17173
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17174
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17175
+ {'train/tv_loss': 0.00026274872943758963, 'train/lm_loss': 3.1874023261480036e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.1149064302444458, 'train/uncertainty_loss': -7.323419558815659e-05, 'train/video_loss': 0.11695586889982224, 'train/total_loss': 0.11698774248361588}
17176
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
17177
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17178
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17179
+ tensor(0.0592, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17180
+ {'train/tv_loss': 0.00016578567447140814, 'train/lm_loss': 6.0000957455486065e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.25501570105552673, 'train/uncertainty_loss': 0.0059223812073469165, 'train/video_loss': 0.2622901201248169, 'train/total_loss': 0.2623501121997833}
17181
+ [Rank 3] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}
17182
+ [Rank 1] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}[Rank 0] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}
17183
+
17184
+ [Rank 2] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}
17185
+ {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06, 'epoch': 0.66}
17186
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17187
+ tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
17188
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17189
+ {'train/tv_loss': 0.0003217862220481038, 'train/lm_loss': 4.653389332816005e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.17965355515480042, 'train/uncertainty_loss': -7.330098887905479e-05, 'train/video_loss': 0.1821758896112442, 'train/total_loss': 0.1822224259376526}
17190
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
17191
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17192
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17193
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17194
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17195
+ {'train/tv_loss': None, 'train/lm_loss': 0.21704933643341065, 'train/info_loss': 0.08072395622730255, 'train/ref_loss': None, 'train/uncertainty_loss': -9.479672298766673e-05, 'train/video_loss': 0.08062916249036789, 'train/total_loss': 0.29767850041389465}
17196
+ [Rank 1] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}[Rank 0] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}[Rank 3] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}
17197
+
17198
+
17199
+ [Rank 2] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}
17200
+ {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06, 'epoch': 0.66}
17201
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
17202
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17203
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17204
+ {'train/tv_loss': None, 'train/lm_loss': 0.15884593725204468, 'train/info_loss': 0.09679526090621948, 'train/ref_loss': None, 'train/uncertainty_loss': -9.324780548922718e-05, 'train/video_loss': 0.09670200943946838, 'train/total_loss': 0.2555479407310486}
17205
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17206
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17207
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
17208
+ {'train/tv_loss': None, 'train/lm_loss': 0.13753668069839478, 'train/info_loss': 0.23872123658657074, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031742780469358, 'train/video_loss': 0.23861806094646454, 'train/total_loss': 0.37615475058555603}
17209
+ tensor(0.1312, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17210
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17211
+ [Rank 0] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}[Rank 3] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}[Rank 2] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}
17212
+
17213
+ [Rank 1] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}
17214
+
17215
+ {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06, 'epoch': 0.66}
17216
+ tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
17217
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17218
+ {'train/tv_loss': None, 'train/lm_loss': 0.3149502038955689, 'train/info_loss': 0.1556193083524704, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012698310893028976, 'train/video_loss': 0.15549232065677643, 'train/total_loss': 0.470442533493042}
17219
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17220
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
17221
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17222
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17223
+ {'train/tv_loss': None, 'train/lm_loss': 0.0765468716621399, 'train/info_loss': 0.2542925477027893, 'train/ref_loss': None, 'train/uncertainty_loss': -8.723060018382967e-05, 'train/video_loss': 0.2542053163051605, 'train/total_loss': 0.3307521939277649}
17224
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17225
+ tensor(0.0120, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17226
+ [Rank 0] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}[Rank 1] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}
17227
+ [Rank 2] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}
17228
+
17229
+ [Rank 3] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}
17230
+ {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06, 'epoch': 0.66}
17231
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17232
+ tensor(0.1122, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17233
+ tensor(0.1959, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17234
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17235
+ {'train/tv_loss': 0.00016258273972198368, 'train/lm_loss': 6.1025843024253845e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.16883832216262817, 'train/uncertainty_loss': -7.159045781008899e-05, 'train/video_loss': 0.17009153962135315, 'train/total_loss': 0.17015255987644196}
17236
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17237
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17238
+ tensor(0.3660, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17239
+ {'train/tv_loss': 0.0002153045265004039, 'train/lm_loss': 2.496086817700416e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.4590832591056824, 'train/uncertainty_loss': 0.03660423159599304, 'train/video_loss': 0.49742814898490906, 'train/total_loss': 0.49745312333106995}
17240
+ tensor(0.2555, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17241
+ [Rank 3] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}[Rank 2] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}
17242
+ [Rank 1] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}
17243
+
17244
+ [Rank 0] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}
17245
+ {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06, 'epoch': 0.66}
17246
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17247
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17248
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17249
+ tensor(1.1109, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17250
+ {'train/tv_loss': 0.0002126151230186224, 'train/lm_loss': 2.4722478701733053e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.9555550813674927, 'train/uncertainty_loss': 0.11109327077865601, 'train/video_loss': 1.0683666467666626, 'train/total_loss': 1.0683913230895996}
17251
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17252
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17253
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17254
+ tensor(0.0930, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17255
+ {'train/tv_loss': 0.00013671087799593806, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2791999578475952, 'train/uncertainty_loss': 0.009300475567579269, 'train/video_loss': 0.2896132469177246, 'train/total_loss': 0.2896498143672943}
17256
+ [Rank 1] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}[Rank 0] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}
17257
+ [Rank 3] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}
17258
+
17259
+ [Rank 2] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}
17260
+ {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06, 'epoch': 0.66}
17261
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17262
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17263
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17264
+ {'train/tv_loss': None, 'train/lm_loss': 0.2787301301956177, 'train/info_loss': 0.2266109138727188, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011889106826856732, 'train/video_loss': 0.2264920175075531, 'train/total_loss': 0.5052221417427063}
17265
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17266
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
17267
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17268
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17269
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17270
+ {'train/tv_loss': None, 'train/lm_loss': 0.13043994903564454, 'train/info_loss': 0.17577716708183289, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001127044321037829, 'train/video_loss': 0.17566446959972382, 'train/total_loss': 0.3061044216156006}
17271
+ [Rank 1] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}[Rank 3] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}[Rank 0] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}
17272
+
17273
+
17274
+ [Rank 2] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}
17275
+ {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06, 'epoch': 0.66}
17276
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17277
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17278
+ tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
17279
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17280
+ {'train/tv_loss': 0.00023483284749090672, 'train/lm_loss': 3.645087999757379e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.06819528341293335, 'train/uncertainty_loss': -6.94944174028933e-05, 'train/video_loss': 0.07002680748701096, 'train/total_loss': 0.07006325572729111}
17281
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17282
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17283
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17284
+ tensor(0.5629, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
17285
+ {'train/tv_loss': 0.000887374859303236, 'train/lm_loss': 4.150436725467444e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.5856791138648987, 'train/uncertainty_loss': 0.05629110932350159, 'train/video_loss': 0.649091899394989, 'train/total_loss': 0.6491333842277527}
17286
+ [Rank 1] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}
17287
+ [Rank 3] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}
17288
+ [Rank 0] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}[Rank 2] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}
17289
+
17290
+ {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06, 'epoch': 0.66}
17291
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17292
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17293
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17294
+ tensor(0.0157, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17295
+ {'train/tv_loss': 0.00012852794025093318, 'train/lm_loss': 5.418520304374397e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.22761771082878113, 'train/uncertainty_loss': 0.001567848213016987, 'train/video_loss': 0.23023721575737, 'train/total_loss': 0.23029139637947083}
17296
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17297
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17298
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
17299
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17300
+ {'train/tv_loss': 0.0001663352712057531, 'train/lm_loss': 2.46032839640975e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.1792670488357544, 'train/uncertainty_loss': -7.328792125917972e-05, 'train/video_loss': 0.18054156005382538, 'train/total_loss': 0.18056616187095642}
17301
+ [Rank 1] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}[Rank 3] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}[Rank 0] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}
17302
+
17303
+ [Rank 2] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}
17304
+
17305
+ {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06, 'epoch': 0.66}
17306
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
17307
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17308
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17309
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17310
+ {'train/tv_loss': 0.00016776639968156816, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2100030779838562, 'train/uncertainty_loss': -7.060928619466722e-05, 'train/video_loss': 0.21129527688026428, 'train/total_loss': 0.21132317185401917}
17311
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
17312
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17313
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17314
+ {'train/tv_loss': 0.00012384995352476835, 'train/lm_loss': 4.760652373079211e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.09827212989330292, 'train/uncertainty_loss': -7.054724264889956e-05, 'train/video_loss': 0.09921616315841675, 'train/total_loss': 0.09926377236843109}
17315
+ tensor(0.1509, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17316
+ [Rank 2] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}[Rank 3] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}[Rank 0] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}
17317
+
17318
+ [Rank 1] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}
17319
+
17320
+ {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06, 'epoch': 0.66}
17321
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17322
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17323
+ {'train/tv_loss': None, 'train/lm_loss': 0.20042383670806885, 'train/info_loss': 0.1449424922466278, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011801900109276175, 'train/video_loss': 0.14482447504997253, 'train/total_loss': 0.3452483117580414}
17324
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17325
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17326
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17327
+ {'train/tv_loss': None, 'train/lm_loss': 0.45617589950561527, 'train/info_loss': 0.24490997195243835, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012810240732505918, 'train/video_loss': 0.24478186666965485, 'train/total_loss': 0.7009577751159668}
17328
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17329
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17330
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17331
+ [Rank 2] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}[Rank 1] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}[Rank 0] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}
17332
+
17333
+ [Rank 3] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}
17334
+
17335
+ {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06, 'epoch': 0.66}
17336
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17337
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
17338
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17339
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
17340
+ {'train/tv_loss': None, 'train/lm_loss': 0.11436709165573121, 'train/info_loss': 0.26202645897865295, 'train/ref_loss': None, 'train/uncertainty_loss': -9.564846986904741e-05, 'train/video_loss': 0.26193082332611084, 'train/total_loss': 0.3762979209423065}
17341
+ tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
17342
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17343
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17344
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17345
+ {'train/tv_loss': None, 'train/lm_loss': 0.15787471532821656, 'train/info_loss': 0.17485037446022034, 'train/ref_loss': None, 'train/uncertainty_loss': -9.450375218875707e-05, 'train/video_loss': 0.17475587129592896, 'train/total_loss': 0.33263057470321655}
17346
+ [Rank 1] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}[Rank 3] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}[Rank 0] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}
17347
+
17348
+ [Rank 2] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}
17349
+
17350
+ {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06, 'epoch': 0.66}
17351
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17352
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17353
+ tensor(0.2767, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17354
+ tensor(0.0772, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17355
+ {'train/tv_loss': 0.00016861287876963617, 'train/lm_loss': 4.701061698142439e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.2696513533592224, 'train/uncertainty_loss': 0.007722488045692444, 'train/video_loss': 0.2787451148033142, 'train/total_loss': 0.2787921130657196}
17356
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17357
+ {'train/tv_loss': None, 'train/lm_loss': 0.20862238407135011, 'train/info_loss': 0.1744736284017563, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001176536548882723, 'train/video_loss': 0.17435596883296967, 'train/total_loss': 0.3829783499240875}
17358
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17359
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17360
+ tensor(0.2500, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17361
+ [Rank 3] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}[Rank 1] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}
17362
+ [Rank 0] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}
17363
+
17364
+ [Rank 2] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}
17365
+ {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06, 'epoch': 0.66}
17366
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
17367
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17368
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17369
+ tensor(0.0352, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17370
+ {'train/tv_loss': 0.00026960894465446474, 'train/lm_loss': 2.4698639754205944e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.24252426624298096, 'train/uncertainty_loss': 0.0035187847912311557, 'train/video_loss': 0.2482176274061203, 'train/total_loss': 0.2482423186302185}
17371
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17372
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17373
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17374
+ {'train/tv_loss': 0.00018143276683986188, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.08057326078414917, 'train/uncertainty_loss': -7.370269158855081e-05, 'train/video_loss': 0.08197075128555298, 'train/total_loss': 0.08200672268867493}
17375
+ tensor(0.3041, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17376
+ [Rank 2] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}[Rank 1] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}
17377
+
17378
+ [Rank 3] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}[Rank 0] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}
17379
+
17380
+ {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06, 'epoch': 0.66}
17381
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17382
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17383
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17384
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17385
+ {'train/tv_loss': None, 'train/lm_loss': 0.06672886610031128, 'train/info_loss': 0.18479126691818237, 'train/ref_loss': None, 'train/uncertainty_loss': -9.260554797947408e-05, 'train/video_loss': 0.1846986562013626, 'train/total_loss': 0.2514275312423706}
17386
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17387
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
17388
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17389
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17390
+ {'train/tv_loss': 0.00025838704314082864, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.09182281792163849, 'train/uncertainty_loss': -6.915096309967339e-05, 'train/video_loss': 0.09383846819400787, 'train/total_loss': 0.09386636316776276}
17391
+ [Rank 1] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}[Rank 0] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}[Rank 3] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}
17392
+
17393
+ [Rank 2] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}
17394
+
17395
+ {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06, 'epoch': 0.66}
17396
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17397
+ tensor(1.0766, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17398
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17399
+ tensor(0.7360, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17400
+ {'train/tv_loss': 0.00015928944339975716, 'train/lm_loss': 4.684376472141594e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.7353293299674988, 'train/uncertainty_loss': 0.07359982132911683, 'train/video_loss': 0.8102254271507263, 'train/total_loss': 0.8102722764015198}
17401
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17402
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
17403
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17404
+ {'train/tv_loss': None, 'train/lm_loss': 0.2558208465576172, 'train/info_loss': 0.18679678440093994, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010758317075669766, 'train/video_loss': 0.18668919801712036, 'train/total_loss': 0.44251003861427307}
17405
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17406
+ [Rank 3] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}
17407
+ [Rank 2] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}
17408
+ [Rank 0] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}[Rank 1] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}
17409
+
17410
+ {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06, 'epoch': 0.66}
17411
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17412
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17413
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17414
+ tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17415
+ {'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}