Training in progress, step 3100
Browse files- adapter_model.safetensors +1 -1
- train.log +358 -0
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1204780872
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:44f8a7d14f22c01a14ae42fef575ccd9bfbced1fa7a387e0f0fe630a34c22899
|
3 |
size 1204780872
|
train.log
CHANGED
@@ -16276,3 +16276,361 @@ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device=
|
|
16276 |
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
16277 |
{'train/tv_loss': None, 'train/lm_loss': 0.15130637884140016, 'train/info_loss': 0.1600653976202011, 'train/ref_loss': None, 'train/uncertainty_loss': -9.910131338983775e-05, 'train/video_loss': 0.15996628999710083, 'train/total_loss': 0.31127268075942993}
|
16278 |
tensor(0.0755, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16276 |
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
16277 |
{'train/tv_loss': None, 'train/lm_loss': 0.15130637884140016, 'train/info_loss': 0.1600653976202011, 'train/ref_loss': None, 'train/uncertainty_loss': -9.910131338983775e-05, 'train/video_loss': 0.15996628999710083, 'train/total_loss': 0.31127268075942993}
|
16278 |
tensor(0.0755, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16279 |
+
[Rank 3] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
|
16280 |
+
[Rank 2] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}[Rank 0] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
|
16281 |
+
[Rank 1] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
|
16282 |
+
|
16283 |
+
{'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09, 'epoch': 0.99}
|
16284 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
16285 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16286 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16287 |
+
{'train/tv_loss': 0.0001784276915714145, 'train/lm_loss': 1.5091327077243478e-05, 'train/info_loss': 1.3351262168725953e-05, 'train/ref_loss': 0.10094004124403, 'train/uncertainty_loss': -6.672072340734303e-05, 'train/video_loss': 0.10231409221887589, 'train/total_loss': 0.10232918709516525}
|
16288 |
+
tensor(0.0191, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16289 |
+
tensor(0.1469, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16290 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16291 |
+
tensor(0.4153, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16292 |
+
tensor(0.1078, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16293 |
+
{'train/tv_loss': 0.0001733818091452122, 'train/lm_loss': 2.2076342429500076e-05, 'train/info_loss': 1.5854584489716217e-05, 'train/ref_loss': 0.2894924581050873, 'train/uncertainty_loss': 0.010783981531858444, 'train/video_loss': 0.3016793429851532, 'train/total_loss': 0.3017014265060425}
|
16294 |
+
[Rank 1] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
|
16295 |
+
[Rank 2] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
|
16296 |
+
[Rank 0] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}[Rank 3] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
|
16297 |
+
|
16298 |
+
{'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09, 'epoch': 0.99}
|
16299 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
16300 |
+
tensor(0.0729, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16301 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16302 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16303 |
+
{'train/tv_loss': 0.0004507274366915226, 'train/lm_loss': 2.8608183492906394e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.16604295372962952, 'train/uncertainty_loss': -7.334401598200203e-05, 'train/video_loss': 0.16959282755851746, 'train/total_loss': 0.16962143778800964}
|
16304 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
16305 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.15873619318008425, 'train/info_loss': 0.20619021356105804, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012108908267691732, 'train/video_loss': 0.20606912672519684, 'train/total_loss': 0.36480534076690674}
|
16306 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
16307 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16308 |
+
tensor(0.0074, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16309 |
+
[Rank 3] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}[Rank 2] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
|
16310 |
+
|
16311 |
+
[Rank 1] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
|
16312 |
+
[Rank 0] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
|
16313 |
+
{'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09, 'epoch': 0.99}
|
16314 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16315 |
+
tensor(0.4648, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16316 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
16317 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
16318 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.31027505397796634, 'train/info_loss': 0.33556777238845825, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012801478151232005, 'train/video_loss': 0.3354397714138031, 'train/total_loss': 0.6457148194313049}
|
16319 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16320 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
16321 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16322 |
+
{'train/tv_loss': 0.0001363527961075306, 'train/lm_loss': 2.822676906362176e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.13075466454029083, 'train/uncertainty_loss': -6.930269300937652e-05, 'train/video_loss': 0.1317932903766632, 'train/total_loss': 0.13182151317596436}
|
16323 |
+
tensor(0.0040, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16324 |
+
[Rank 2] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
|
16325 |
+
[Rank 0] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}[Rank 3] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
|
16326 |
+
|
16327 |
+
{'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09, 'epoch': 0.99}
|
16328 |
+
[Rank 1] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
|
16329 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16330 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
16331 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16332 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
16333 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.30461447238922124, 'train/info_loss': 0.1613609790802002, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011446572607383133, 'train/video_loss': 0.16124650835990906, 'train/total_loss': 0.4658609926700592}
|
16334 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
16335 |
+
tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
16336 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2111635446548462, 'train/info_loss': 0.14900483191013336, 'train/ref_loss': None, 'train/uncertainty_loss': -8.154477691277862e-05, 'train/video_loss': 0.14892329275608063, 'train/total_loss': 0.3600868582725525}
|
16337 |
+
tensor(0.1748, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16338 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
16339 |
+
[Rank 1] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
|
16340 |
+
[Rank 3] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
|
16341 |
+
[Rank 0] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}[Rank 2] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
|
16342 |
+
|
16343 |
+
{'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09, 'epoch': 0.99}
|
16344 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
16345 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16346 |
+
tensor(0.0091, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16347 |
+
tensor(0.0925, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16348 |
+
{'train/tv_loss': 0.00016606017015874386, 'train/lm_loss': 7.689904887229205e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.2799209952354431, 'train/uncertainty_loss': 0.009252391010522843, 'train/video_loss': 0.29052385687828064, 'train/total_loss': 0.2906007468700409}
|
16349 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
16350 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
16351 |
+
tensor(0.1732, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16352 |
+
tensor(0.1812, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16353 |
+
{'train/tv_loss': 0.00012968671508133413, 'train/lm_loss': 3.659390495158732e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.3349378705024719, 'train/uncertainty_loss': 0.018121950328350067, 'train/video_loss': 0.35411736369132996, 'train/total_loss': 0.35415396094322205}
|
16354 |
+
[Rank 3] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
|
16355 |
+
[Rank 2] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
|
16356 |
+
[Rank 0] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}[Rank 1] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
|
16357 |
+
|
16358 |
+
{'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09, 'epoch': 0.99}
|
16359 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
16360 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(0.0697, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16361 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16362 |
+
{'train/tv_loss': 0.00016026009107008579, 'train/lm_loss': 1.6855483409017324e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.26810598373413086, 'train/uncertainty_loss': 0.006967854499816895, 'train/video_loss': 0.2763703167438507, 'train/total_loss': 0.27638718485832214}
|
16363 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16364 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
16365 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
16366 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10554903745651245, 'train/info_loss': 0.11926790326833725, 'train/ref_loss': None, 'train/uncertainty_loss': -9.559270110912622e-05, 'train/video_loss': 0.11917231231927872, 'train/total_loss': 0.22472134232521057}
|
16367 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16368 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16369 |
+
[Rank 3] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}[Rank 1] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}[Rank 0] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}
|
16370 |
+
|
16371 |
+
[Rank 2] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}
|
16372 |
+
|
16373 |
+
{'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09, 'epoch': 0.99}
|
16374 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
16375 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10073459148406982, 'train/info_loss': 0.14998847246170044, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012555404100567102, 'train/video_loss': 0.14986291527748108, 'train/total_loss': 0.2505975067615509}
|
16376 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
16377 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
16378 |
+
tensor(0.2004, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16379 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16380 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
16381 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.39838287830352787, 'train/info_loss': 0.1479235589504242, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012364789145067335, 'train/video_loss': 0.14779990911483765, 'train/total_loss': 0.5461827516555786}
|
16382 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16383 |
+
tensor(-0.0006, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
|
16384 |
+
[Rank 1] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}[Rank 2] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
|
16385 |
+
[Rank 3] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
|
16386 |
+
|
16387 |
+
[Rank 0] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
|
16388 |
+
{'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09, 'epoch': 0.99}
|
16389 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16390 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16391 |
+
{'train/tv_loss': 0.00012514020781964063, 'train/lm_loss': 1.3207952724769713e-05, 'train/info_loss': 1.2516818969743326e-05, 'train/ref_loss': 0.19032391905784607, 'train/uncertainty_loss': -6.848637713119388e-05, 'train/video_loss': 0.19126906991004944, 'train/total_loss': 0.1912822723388672}
|
16392 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16393 |
+
tensor(0.2011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16394 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
16395 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.21502296924591066, 'train/info_loss': 0.1758623719215393, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011166655458509922, 'train/video_loss': 0.1757507026195526, 'train/total_loss': 0.3907736539840698}
|
16396 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16397 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
16398 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
16399 |
+
[Rank 2] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}[Rank 0] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}[Rank 3] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}
|
16400 |
+
|
16401 |
+
|
16402 |
+
[Rank 1] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}
|
16403 |
+
{'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09, 'epoch': 1.0}
|
16404 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
16405 |
+
tensor(0.0269, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16406 |
+
{'train/tv_loss': 0.00012117947917431593, 'train/lm_loss': 2.856050559785217e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.23599904775619507, 'train/uncertainty_loss': 0.0026872064918279648, 'train/video_loss': 0.23967339098453522, 'train/total_loss': 0.23970195651054382}
|
16407 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16408 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16409 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
16410 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2651496648788452, 'train/info_loss': 0.16941337287425995, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012172098504379392, 'train/video_loss': 0.1692916452884674, 'train/total_loss': 0.43444132804870605}
|
16411 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
16412 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16413 |
+
tensor(0.0431, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16414 |
+
[Rank 2] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}[Rank 3] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}[Rank 0] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}
|
16415 |
+
|
16416 |
+
|
16417 |
+
[Rank 1] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}
|
16418 |
+
{'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09, 'epoch': 1.0}
|
16419 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
16420 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
16421 |
+
tensor(0.1725, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16422 |
+
tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
16423 |
+
{'train/tv_loss': 0.00032057708594948056, 'train/lm_loss': 1.3088752166368068e-05, 'train/info_loss': 1.3112849956087302e-05, 'train/ref_loss': 0.06266696751117706, 'train/uncertainty_loss': -7.520327344536782e-05, 'train/video_loss': 0.06516949087381363, 'train/total_loss': 0.06518258154392242}
|
16424 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
16425 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
16426 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.35767147541046146, 'train/info_loss': 0.1781483143568039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011188344797119498, 'train/video_loss': 0.17803643643856049, 'train/total_loss': 0.5357078909873962}
|
16427 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
16428 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16429 |
+
[Rank 2] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}[Rank 3] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}[Rank 0] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}
|
16430 |
+
|
16431 |
+
|
16432 |
+
[Rank 1] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}
|
16433 |
+
{'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10, 'epoch': 1.0}
|
16434 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16435 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16436 |
+
tensor(0.1212, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16437 |
+
{'train/tv_loss': 0.00014093549689278006, 'train/lm_loss': 1.683164300629869e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.2974424362182617, 'train/uncertainty_loss': 0.012122622132301331, 'train/video_loss': 0.3107067346572876, 'train/total_loss': 0.31072357296943665}
|
16438 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16439 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16440 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
16441 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16442 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
16443 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.05586314797401429, 'train/info_loss': 0.11093997955322266, 'train/ref_loss': None, 'train/uncertainty_loss': -8.851074380800128e-05, 'train/video_loss': 0.1108514666557312, 'train/total_loss': 0.166714608669281}
|
16444 |
+
[Rank 0] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}[Rank 1] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}[Rank 3] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}
|
16445 |
+
|
16446 |
+
|
16447 |
+
[Rank 2] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}
|
16448 |
+
{'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10, 'epoch': 1.0}
|
16449 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
16450 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
16451 |
+
tensor(0.1352, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16452 |
+
{'train/tv_loss': 0.00017385379178449514, 'train/lm_loss': 6.800925475545228e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.30475878715515137, 'train/uncertainty_loss': 0.013520647585391999, 'train/video_loss': 0.3196900188922882, 'train/total_loss': 0.31975802779197693}
|
16453 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16454 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
16455 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
16456 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16457 |
+
{'train/tv_loss': 0.00015056305564939976, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.18287095427513123, 'train/uncertainty_loss': -7.037441246211529e-05, 'train/video_loss': 0.18402278423309326, 'train/total_loss': 0.18404754996299744}
|
16458 |
+
tensor(0.1576, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16459 |
+
[Rank 3] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
|
16460 |
+
[Rank 0] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}[Rank 1] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
|
16461 |
+
|
16462 |
+
[Rank 2] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
|
16463 |
+
{'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10, 'epoch': 1.0}
|
16464 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16465 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16466 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16467 |
+
tensor(0.0547, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16468 |
+
{'train/tv_loss': 9.430212085135282e-05, 'train/lm_loss': 1.3041071360930802e-05, 'train/info_loss': 1.293404056923464e-05, 'train/ref_loss': 0.24797077476978302, 'train/uncertainty_loss': 0.005471675470471383, 'train/video_loss': 0.2542097866535187, 'train/total_loss': 0.2542228400707245}
|
16469 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16470 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16471 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16472 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16473 |
+
{'train/tv_loss': 0.0001686649862676859, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4662527973996475e-05, 'train/ref_loss': 0.16706281900405884, 'train/uncertainty_loss': -7.104419637471437e-05, 'train/video_loss': 0.16835574805736542, 'train/total_loss': 0.1683724820613861}
|
16474 |
+
[Rank 3] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
|
16475 |
+
[Rank 0] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}[Rank 1] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
|
16476 |
+
|
16477 |
+
[Rank 2] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
|
16478 |
+
{'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10, 'epoch': 1.0}
|
16479 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16480 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
16481 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2963602066040039, 'train/info_loss': 0.15512116253376007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010410062968730928, 'train/video_loss': 0.15501706302165985, 'train/total_loss': 0.451377272605896}
|
16482 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16483 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16484 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16485 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16486 |
+
{'train/tv_loss': 0.00016122745582833887, 'train/lm_loss': 1.504364627180621e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.12243026494979858, 'train/uncertainty_loss': -7.243495201691985e-05, 'train/video_loss': 0.12366142123937607, 'train/total_loss': 0.12367646396160126}
|
16487 |
+
tensor(0.0812, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16488 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16489 |
+
[Rank 0] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}[Rank 1] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
|
16490 |
+
[Rank 2] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
|
16491 |
+
|
16492 |
+
[Rank 3] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
|
16493 |
+
{'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10, 'epoch': 1.0}
|
16494 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16495 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
16496 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.035262671113014225, 'train/info_loss': 0.207004576921463, 'train/ref_loss': None, 'train/uncertainty_loss': -8.50230921059847e-05, 'train/video_loss': 0.20691955089569092, 'train/total_loss': 0.24218222498893738}
|
16497 |
+
tensor(0.1316, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16498 |
+
tensor(0.0744, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16499 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
16500 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.08468132019042969, 'train/info_loss': 0.13620348274707794, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001168315066024661, 'train/video_loss': 0.13608665764331818, 'train/total_loss': 0.22076797485351562}
|
16501 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16502 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
16503 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
16504 |
+
[Rank 2] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}[Rank 3] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
|
16505 |
+
|
16506 |
+
[Rank 1] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
|
16507 |
+
[Rank 0] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
|
16508 |
+
{'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10, 'epoch': 1.0}
|
16509 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
16510 |
+
tensor(0.1708, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16511 |
+
tensor(0.1118, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16512 |
+
{'train/tv_loss': 0.0001698122243396938, 'train/lm_loss': 1.723692112136632e-05, 'train/info_loss': 1.293404056923464e-05, 'train/ref_loss': 0.2866149842739105, 'train/uncertainty_loss': 0.01117597669363022, 'train/video_loss': 0.2991624176502228, 'train/total_loss': 0.29917964339256287}
|
16513 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16514 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
16515 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3133418560028076, 'train/info_loss': 0.17231473326683044, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012505786726251246, 'train/video_loss': 0.17218968272209167, 'train/total_loss': 0.4855315387248993}
|
16516 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
16517 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
16518 |
+
tensor(0.0181, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16519 |
+
[Rank 2] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
16520 |
+
[Rank 3] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
16521 |
+
[Rank 1] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
16522 |
+
[Rank 0] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
16523 |
+
{'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10, 'epoch': 1.0}
|
16524 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
16525 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16526 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16527 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
16528 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3884145736694336, 'train/info_loss': 0.23487679660320282, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012862115399912, 'train/video_loss': 0.23474816977977753, 'train/total_loss': 0.6231627464294434}
|
16529 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
16530 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16531 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
16532 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.010719782114028931, 'train/info_loss': 0.19857865571975708, 'train/ref_loss': None, 'train/uncertainty_loss': -8.820317452773452e-05, 'train/video_loss': 0.1984904557466507, 'train/total_loss': 0.20921023190021515}
|
16533 |
+
tensor(0.0573, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16534 |
+
[Rank 3] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
|
16535 |
+
[Rank 1] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}[Rank 0] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
|
16536 |
+
[Rank 2] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
|
16537 |
+
|
16538 |
+
{'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10, 'epoch': 1.0}
|
16539 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
16540 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16541 |
+
{'train/tv_loss': 0.0002269430086016655, 'train/lm_loss': 1.3041071360930802e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.15769097208976746, 'train/uncertainty_loss': -7.11314962245524e-05, 'train/video_loss': 0.15944914519786835, 'train/total_loss': 0.15946218371391296}
|
16542 |
+
tensor(0.1142, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16543 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
16544 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
16545 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
16546 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16547 |
+
{'train/tv_loss': 0.00013790428638458252, 'train/lm_loss': 2.784535172395408e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.2233009785413742, 'train/uncertainty_loss': -7.168206502683461e-05, 'train/video_loss': 0.22434939444065094, 'train/total_loss': 0.22437724471092224}
|
16548 |
+
tensor(0.1906, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16549 |
+
[Rank 3] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}[Rank 0] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}[Rank 2] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}
|
16550 |
+
|
16551 |
+
[Rank 1] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}
|
16552 |
+
|
16553 |
+
{'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10, 'epoch': 1.0}
|
16554 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16555 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
16556 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
16557 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.1099284052848816, 'train/info_loss': 0.18190373480319977, 'train/ref_loss': None, 'train/uncertainty_loss': -9.252233430743218e-05, 'train/video_loss': 0.18181121349334717, 'train/total_loss': 0.2917396128177643}
|
16558 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16559 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16560 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16561 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
16562 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
16563 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2040557146072388, 'train/info_loss': 0.19522923231124878, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010584730189293623, 'train/video_loss': 0.19512338936328888, 'train/total_loss': 0.3991791009902954}
|
16564 |
+
[Rank 1] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
|
16565 |
+
[Rank 0] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}[Rank 3] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
|
16566 |
+
[Rank 2] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
|
16567 |
+
|
16568 |
+
{'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11, 'epoch': 1.0}
|
16569 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16570 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
16571 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16572 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16573 |
+
{'train/tv_loss': 0.00012163397623226047, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.6629419405944645e-05, 'train/ref_loss': 0.0847846046090126, 'train/uncertainty_loss': -7.099361391738057e-05, 'train/video_loss': 0.0857033059000969, 'train/total_loss': 0.08572807908058167}
|
16574 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
16575 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.30223650932312013, 'train/info_loss': 0.21647511422634125, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013338448479771614, 'train/video_loss': 0.21634173393249512, 'train/total_loss': 0.518578290939331}
|
16576 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
16577 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16578 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16579 |
+
[Rank 3] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}[Rank 0] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}[Rank 2] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}
|
16580 |
+
|
16581 |
+
|
16582 |
+
{'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11, 'epoch': 1.0}
|
16583 |
+
[Rank 1] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}
|
16584 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
16585 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
16586 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16587 |
+
{'train/tv_loss': 0.00012253506574779748, 'train/lm_loss': 1.4829085557721555e-05, 'train/info_loss': 1.2755231182381976e-05, 'train/ref_loss': 0.06293053925037384, 'train/uncertainty_loss': -6.927629001438618e-05, 'train/video_loss': 0.06385429948568344, 'train/total_loss': 0.06386912614107132}
|
16588 |
+
tensor(0.3175, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16589 |
+
tensor(0.2061, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16590 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16591 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
16592 |
+
{'train/tv_loss': 0.00018769102171063424, 'train/lm_loss': 3.144493966829032e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.18825295567512512, 'train/uncertainty_loss': -6.965706706978381e-05, 'train/video_loss': 0.18970192968845367, 'train/total_loss': 0.1897333711385727}
|
16593 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
16594 |
+
[Rank 1] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}[Rank 0] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}[Rank 2] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}
|
16595 |
+
|
16596 |
+
[Rank 3] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}
|
16597 |
+
|
16598 |
+
{'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11, 'epoch': 1.0}
|
16599 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
16600 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.08768467903137207, 'train/info_loss': 0.19097542762756348, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011287924135103822, 'train/video_loss': 0.19086255133152008, 'train/total_loss': 0.2785472273826599}
|
16601 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16602 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
16603 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
16604 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
16605 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
16606 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3417798757553101, 'train/info_loss': 0.12373294681310654, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011930712498724461, 'train/video_loss': 0.123613640666008, 'train/total_loss': 0.4653935432434082}
|
16607 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
16608 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
16609 |
+
[Rank 1] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}[Rank 2] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}[Rank 3] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}
|
16610 |
+
|
16611 |
+
|
16612 |
+
[Rank 0] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}
|
16613 |
+
{'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12, 'epoch': 1.0}
|
16614 |
+
tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
|
16615 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.22061138153076174, 'train/info_loss': 0.19586126506328583, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013994580367580057, 'train/video_loss': 0.1957213133573532, 'train/total_loss': 0.4163326919078827}
|
16616 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
16617 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
16618 |
+
tensor(0.3543, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
16619 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
16620 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
16621 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.5868082523345948, 'train/info_loss': 0.14800412952899933, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013187677832320334, 'train/video_loss': 0.14787225425243378, 'train/total_loss': 0.7346805334091187}
|
16622 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
16623 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
16624 |
+
[Rank 1] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}[Rank 2] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}[Rank 3] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}
|
16625 |
+
|
16626 |
+
|
16627 |
+
[Rank 0] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}
|
16628 |
+
{'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0, 'epoch': 1.0}
|
16629 |
+
[Rank 1] Trainer log: {'train_runtime': 29823.9547, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}[Rank 2] Trainer log: {'train_runtime': 29822.0432, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}[Rank 3] Trainer log: {'train_runtime': 29817.5825, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}
|
16630 |
+
|
16631 |
+
|
16632 |
+
[Rank 0] Trainer log: {'train_runtime': 29828.2573, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}
|
16633 |
+
{'train_runtime': 29828.2573, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'train_loss': 0.11426909348176371, 'epoch': 1.0}
|
16634 |
+
Finished TrainingFinished Training
|
16635 |
+
Finished Training
|
16636 |
+
|