Training in progress, step 2075
- adapter_model.safetensors +1 -1
- train.log +360 -0
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b24c739c235e4e9845a17684af3e9ae65224f6f9ce24eac3682df1571e15c4f5
 size 1204780872
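The pointer above is what Git LFS stores in place of the adapter weights; its oid is the SHA-256 of the actual file content. As a minimal sketch (not part of this commit), a locally downloaded copy can be checked against the new pointer by hashing it and comparing to the oid; the file path here is an assumption, adjust it to wherever the file was saved.

import hashlib

def sha256_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file in chunks so large weight files do not need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid from the updated LFS pointer above
expected = "b24c739c235e4e9845a17684af3e9ae65224f6f9ce24eac3682df1571e15c4f5"
print(sha256_of_file("adapter_model.safetensors") == expected)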
train.log
CHANGED
|
@@ -17413,3 +17413,363 @@ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device=
|
|
| 17413 |
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17414 |
tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17415 |
{'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}
|
| 17416 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17417 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17418 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17419 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.36234724521636963, 'train/info_loss': 0.14337365329265594, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011461186222732067, 'train/video_loss': 0.14325904846191406, 'train/total_loss': 0.5056062936782837}
|
| 17420 |
+
tensor(0.0681, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17421 |
+
[Rank 3] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 0] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 2] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}
|
| 17422 |
+
|
| 17423 |
+
|
| 17424 |
+
[Rank 1] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}
|
| 17425 |
+
{'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06, 'epoch': 0.66}
|
| 17426 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17427 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17428 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17429 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.06900591254234315, 'train/info_loss': 0.15682968497276306, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010683162836357952, 'train/video_loss': 0.15672285854816437, 'train/total_loss': 0.22572878003120422}
|
| 17430 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17431 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17432 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.40285620689392093, 'train/info_loss': 0.18601451814174652, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011755496961995959, 'train/video_loss': 0.18589696288108826, 'train/total_loss': 0.5887531638145447}
|
| 17433 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17434 |
+
tensor(0.0370, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17435 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17436 |
+
[Rank 1] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
|
| 17437 |
+
[Rank 0] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}[Rank 3] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
|
| 17438 |
+
|
| 17439 |
+
[Rank 2] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
|
| 17440 |
+
{'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06, 'epoch': 0.66}
|
| 17441 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17442 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17443 |
+
tensor(0.2234, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17444 |
+
{'train/tv_loss': 0.00023927739821374419, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.3630368113517761, 'train/uncertainty_loss': 0.022341114282608033, 'train/video_loss': 0.3873103857040405, 'train/total_loss': 0.3873384892940521}
|
| 17445 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17446 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17447 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17448 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17449 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.14715486764907837, 'train/info_loss': 0.13828474283218384, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031541614793241, 'train/video_loss': 0.13818158209323883, 'train/total_loss': 0.285336434841156}
|
| 17450 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17451 |
+
[Rank 0] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}[Rank 1] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
|
| 17452 |
+
[Rank 2] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
|
| 17453 |
+
|
| 17454 |
+
[Rank 3] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
|
| 17455 |
+
{'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06, 'epoch': 0.66}
|
| 17456 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17457 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.4195257663726807, 'train/info_loss': 0.10877030342817307, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011566100874915719, 'train/video_loss': 0.10865464061498642, 'train/total_loss': 0.5281804203987122}
|
| 17458 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17459 |
+
tensor(0.1955, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17460 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17461 |
+
tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17462 |
+
{'train/tv_loss': None, 'train/lm_loss': 2.8298282995820048e-05, 'train/info_loss': 2.342404332011938e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015964442864060404, 'train/video_loss': -0.00013622039114125073, 'train/total_loss': -0.00010792211105581373}
|
| 17463 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17464 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17465 |
+
tensor(0.0522, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17466 |
+
[Rank 3] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
|
| 17467 |
+
[Rank 0] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}[Rank 1] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
|
| 17468 |
+
|
| 17469 |
+
[Rank 2] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
|
| 17470 |
+
{'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06, 'epoch': 0.66}
|
| 17471 |
+
tensor(-0.0016, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17472 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17473 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17474 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17475 |
+
{'train/tv_loss': 0.00031795650720596315, 'train/lm_loss': 7.840050384402276e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.1724846065044403, 'train/uncertainty_loss': -6.841651047579944e-05, 'train/video_loss': 0.17498399317264557, 'train/total_loss': 0.1750623881816864}
|
| 17476 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17477 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.19196442365646363, 'train/info_loss': 0.2579406201839447, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011937021045014263, 'train/video_loss': 0.257821261882782, 'train/total_loss': 0.4497857093811035}
|
| 17478 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17479 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17480 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17481 |
+
[Rank 2] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
|
| 17482 |
+
[Rank 3] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
|
| 17483 |
+
[Rank 0] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}[Rank 1] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
|
| 17484 |
+
|
| 17485 |
+
{'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06, 'epoch': 0.66}
|
| 17486 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17487 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17488 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3795618772506714, 'train/info_loss': 0.29438889026641846, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011655737180262805, 'train/video_loss': 0.2942723333835602, 'train/total_loss': 0.6738342046737671}
|
| 17489 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17490 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17491 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17492 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.32472651004791264, 'train/info_loss': 0.22495689988136292, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001121837878599763, 'train/video_loss': 0.22484470903873444, 'train/total_loss': 0.5495712161064148}
|
| 17493 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17494 |
+
tensor(0.0239, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17495 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17496 |
+
[Rank 2] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}[Rank 3] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
|
| 17497 |
+
|
| 17498 |
+
[Rank 0] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
|
| 17499 |
+
[Rank 1] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
|
| 17500 |
+
{'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06, 'epoch': 0.66}
|
| 17501 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17502 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17503 |
+
tensor(0.4046, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17504 |
+
{'train/tv_loss': 0.00038167270831763745, 'train/lm_loss': 2.8584344545379284e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.49088212847709656, 'train/uncertainty_loss': 0.04045624732971192, 'train/video_loss': 0.5344108939170837, 'train/total_loss': 0.5344395041465759}
|
| 17505 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17506 |
+
tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17507 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17508 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.24148297309875488, 'train/info_loss': 0.2007106989622116, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011322950012981892, 'train/video_loss': 0.20059746503829956, 'train/total_loss': 0.44208043813705444}
|
| 17509 |
+
tensor(0.0585, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17510 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17511 |
+
[Rank 0] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 2] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 1] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}
|
| 17512 |
+
|
| 17513 |
+
|
| 17514 |
+
[Rank 3] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}
|
| 17515 |
+
{'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06, 'epoch': 0.66}
|
| 17516 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17517 |
+
tensor(0.2698, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17518 |
+
tensor(0.4741, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17519 |
+
{'train/tv_loss': 0.00020899735391139985, 'train/lm_loss': 2.796454355120659e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.5671656727790833, 'train/uncertainty_loss': 0.04740565419197083, 'train/video_loss': 0.6162612438201904, 'train/total_loss': 0.6162891983985901}
|
| 17520 |
+
tensor(0.2692, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17521 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17522 |
+
tensor(0.2025, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17523 |
+
{'train/tv_loss': 0.00020984613802284003, 'train/lm_loss': 4.1623553261160855e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.34875354170799255, 'train/uncertainty_loss': 0.020245166122913362, 'train/video_loss': 0.3707001805305481, 'train/total_loss': 0.3707418143749237}
|
| 17524 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17525 |
+
tensor(0.1522, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17526 |
+
[Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
|
| 17527 |
+
[Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
|
| 17528 |
+
|
| 17529 |
+
[Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
|
| 17530 |
+
{'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06, 'epoch': 0.66}
|
| 17531 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17532 |
+
tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17533 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3018145322799683, 'train/info_loss': 0.25296148657798767, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014256734866648912, 'train/video_loss': 0.25281891226768494, 'train/total_loss': 0.5546334385871887}
|
| 17534 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17535 |
+
tensor(0.7656, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17536 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17537 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.39072275161743164, 'train/info_loss': 0.18714040517807007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010978373466059567, 'train/video_loss': 0.187030628323555, 'train/total_loss': 0.5777533650398254}
|
| 17538 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17539 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17540 |
+
tensor(0.0862, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17541 |
+
[Rank 1] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 0] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 3] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}
|
| 17542 |
+
|
| 17543 |
+
[Rank 2] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}
|
| 17544 |
+
|
| 17545 |
+
{'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06, 'epoch': 0.66}
|
| 17546 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17547 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17548 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3718518972396851, 'train/info_loss': 0.2331818789243698, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013318745186552405, 'train/video_loss': 0.2330486923456192, 'train/total_loss': 0.604900598526001}
|
| 17549 |
+
tensor(0.2073, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17550 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17551 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17552 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17553 |
+
tensor(0.1576, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17554 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17555 |
+
{'train/tv_loss': 0.00022317415568977596, 'train/lm_loss': 2.8441313770599666e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.14411097764968872, 'train/uncertainty_loss': -6.683562532998622e-05, 'train/video_loss': 0.1458483785390854, 'train/total_loss': 0.14587682485580444}
|
| 17556 |
+
[Rank 3] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}[Rank 0] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
|
| 17557 |
+
[Rank 2] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
|
| 17558 |
+
[Rank 1] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
|
| 17559 |
+
|
| 17560 |
+
{'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06, 'epoch': 0.67}
|
| 17561 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17562 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17563 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17564 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17565 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.062081152200698854, 'train/info_loss': 0.16616980731487274, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001029652892611921, 'train/video_loss': 0.16606684029102325, 'train/total_loss': 0.22814799845218658}
|
| 17566 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17567 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17568 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17569 |
+
{'train/tv_loss': 0.00018424011068418622, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.03961974009871483, 'train/uncertainty_loss': -6.689630681648851e-05, 'train/video_loss': 0.04104745015501976, 'train/total_loss': 0.04108402132987976}
|
| 17570 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17571 |
+
[Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
|
| 17572 |
+
[Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
|
| 17573 |
+
|
| 17574 |
+
[Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
|
| 17575 |
+
{'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06, 'epoch': 0.67}
|
| 17576 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17577 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17578 |
+
tensor(0.0723, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17579 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17580 |
+
{'train/tv_loss': 0.0002113046357408166, 'train/lm_loss': 4.093228199053556e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.0825377106666565, 'train/uncertainty_loss': -6.546297227032483e-05, 'train/video_loss': 0.08418301492929459, 'train/total_loss': 0.08422394841909409}
|
| 17581 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17582 |
+
tensor(0.1926, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17583 |
+
tensor(0.1136, device='cuda:3', grad_fn=<AddBackward0>) tensor(0.0902, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17584 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17585 |
+
{'train/tv_loss': 0.0002980440389364958, 'train/lm_loss': 3.676076594274491e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.27036359906196594, 'train/uncertainty_loss': 0.00902046412229538, 'train/video_loss': 0.2817894220352173, 'train/total_loss': 0.2818261682987213}
|
| 17586 |
+
[Rank 1] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
|
| 17587 |
+
[Rank 3] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
|
| 17588 |
+
[Rank 2] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
|
| 17589 |
+
[Rank 0] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
|
| 17590 |
+
{'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06, 'epoch': 0.67}
|
| 17591 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17592 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.22054891586303713, 'train/info_loss': 0.16172195971012115, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012107143411412836, 'train/video_loss': 0.16160088777542114, 'train/total_loss': 0.3821498155593872}
|
| 17593 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17594 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17595 |
+
tensor(0.0148, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17596 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17597 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17598 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2663368940353394, 'train/info_loss': 0.10841624438762665, 'train/ref_loss': None, 'train/uncertainty_loss': -8.942196145653725e-05, 'train/video_loss': 0.10832682251930237, 'train/total_loss': 0.37466371059417725}
|
| 17599 |
+
tensor(0.1076, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17600 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17601 |
+
[Rank 1] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 2] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 3] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}
|
| 17602 |
+
|
| 17603 |
+
|
| 17604 |
+
[Rank 0] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}
|
| 17605 |
+
{'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06, 'epoch': 0.67}
|
| 17606 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17607 |
+
tensor(0.2763, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17608 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17609 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17610 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.06459110975265503, 'train/info_loss': 0.21699465811252594, 'train/ref_loss': None, 'train/uncertainty_loss': -9.331009350717069e-05, 'train/video_loss': 0.21690134704113007, 'train/total_loss': 0.2814924716949463}
|
| 17611 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17612 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17613 |
+
tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17614 |
+
tensor(0.0351, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17615 |
+
{'train/tv_loss': 0.00018034547101706267, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.22903497517108917, 'train/uncertainty_loss': 0.003506988659501076, 'train/video_loss': 0.2340044528245926, 'train/total_loss': 0.23403652012348175}
|
| 17616 |
+
[Rank 1] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
|
| 17617 |
+
[Rank 0] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}[Rank 2] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
|
| 17618 |
+
|
| 17619 |
+
[Rank 3] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
|
| 17620 |
+
{'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06, 'epoch': 0.67}
|
| 17621 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17622 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.07050921320915222, 'train/info_loss': 0.19436703622341156, 'train/ref_loss': None, 'train/uncertainty_loss': -8.927494054660201e-05, 'train/video_loss': 0.19427776336669922, 'train/total_loss': 0.2647869884967804}
|
| 17623 |
+
tensor(0.1349, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17624 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17625 |
+
tensor(0.0307, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17626 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17627 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17628 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17629 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.15221678018569948, 'train/info_loss': 0.10548965632915497, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011310963891446591, 'train/video_loss': 0.10537654906511307, 'train/total_loss': 0.2575933337211609}
|
| 17630 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17631 |
+
[Rank 1] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 0] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 3] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}
|
| 17632 |
+
|
| 17633 |
+
[Rank 2] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}
|
| 17634 |
+
|
| 17635 |
+
{'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06, 'epoch': 0.67}
|
| 17636 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17637 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17638 |
+
tensor(0.1315, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17639 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17640 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.17559853792190552, 'train/info_loss': 0.09834705293178558, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001089532976038754, 'train/video_loss': 0.09823810309171677, 'train/total_loss': 0.2738366425037384}
|
| 17641 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17642 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17643 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17644 |
+
tensor(0.3571, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17645 |
+
{'train/tv_loss': 0.00020730055402964356, 'train/lm_loss': 3.654622996691615e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.46529483795166016, 'train/uncertainty_loss': 0.035708272457122804, 'train/video_loss': 0.5026838779449463, 'train/total_loss': 0.5027204155921936}
|
| 17646 |
+
[Rank 2] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}[Rank 0] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
|
| 17647 |
+
[Rank 1] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
|
| 17648 |
+
|
| 17649 |
+
[Rank 3] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
|
| 17650 |
+
{'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06, 'epoch': 0.67}
|
| 17651 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17652 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.1469507694244385, 'train/info_loss': 0.2718953788280487, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012867078185081483, 'train/video_loss': 0.271766722202301, 'train/total_loss': 0.41871750354766846}
|
| 17653 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17654 |
+
tensor(0.5532, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17655 |
+
tensor(0.4076, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17656 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17657 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17658 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.08282127976417542, 'train/info_loss': 0.19566015899181366, 'train/ref_loss': None, 'train/uncertainty_loss': -8.814950124360622e-05, 'train/video_loss': 0.19557200372219086, 'train/total_loss': 0.2783932685852051}
|
| 17659 |
+
tensor(0.6230, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17660 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17661 |
+
[Rank 2] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
|
| 17662 |
+
[Rank 1] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
|
| 17663 |
+
[Rank 3] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
|
| 17664 |
+
[Rank 0] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
|
| 17665 |
+
{'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06, 'epoch': 0.67}
|
| 17666 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17667 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17668 |
+
tensor(0.4353, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17669 |
+
{'train/tv_loss': 0.00014477868098765613, 'train/lm_loss': 2.8036060393787923e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.5355187058448792, 'train/uncertainty_loss': 0.043532878160476685, 'train/video_loss': 0.5802289247512817, 'train/total_loss': 0.5802569389343262}
|
| 17670 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17671 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17672 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2846735239028931, 'train/info_loss': 0.25035104155540466, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012556229485198857, 'train/video_loss': 0.2502254843711853, 'train/total_loss': 0.5348989963531494}
|
| 17673 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17674 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17675 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17676 |
+
[Rank 1] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
|
| 17677 |
+
[Rank 0] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}[Rank 2] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
|
| 17678 |
+
[Rank 3] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
|
| 17679 |
+
|
| 17680 |
+
{'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06, 'epoch': 0.67}
|
| 17681 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17682 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.35250082015991213, 'train/info_loss': 0.12986838817596436, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011850049486383796, 'train/video_loss': 0.12974989414215088, 'train/total_loss': 0.48225072026252747}
|
| 17683 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17684 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17685 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17686 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17687 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17688 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.06482524275779725, 'train/info_loss': 0.19261880218982697, 'train/ref_loss': None, 'train/uncertainty_loss': -9.101710165850819e-05, 'train/video_loss': 0.19252778589725494, 'train/total_loss': 0.2573530375957489}
|
| 17689 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17690 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17691 |
+
[Rank 3] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}[Rank 0] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
|
| 17692 |
+
[Rank 1] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
|
| 17693 |
+
[Rank 2] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
|
| 17694 |
+
|
| 17695 |
+
{'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06, 'epoch': 0.67}
|
| 17696 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17697 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17698 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17699 |
+
tensor(0.0826, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17700 |
+
{'train/tv_loss': 0.00014374495949596167, 'train/lm_loss': 2.815525222104043e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2506270706653595, 'train/uncertainty_loss': 0.008263303339481354, 'train/video_loss': 0.2600594460964203, 'train/total_loss': 0.26008760929107666}
|
| 17701 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17702 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.32692942619323734, 'train/info_loss': 0.17380265891551971, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010667567839846016, 'train/video_loss': 0.17369598150253296, 'train/total_loss': 0.5006253719329834}
|
| 17703 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17704 |
+
tensor(0.4338, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17705 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17706 |
+
[Rank 3] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
|
| 17707 |
+
[Rank 2] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}[Rank 1] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
|
| 17708 |
+
[Rank 0] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
|
| 17709 |
+
|
| 17710 |
+
{'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06, 'epoch': 0.67}
|
| 17711 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17712 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17713 |
+
tensor(0.0401, device='cuda:0', grad_fn=<AddBackward0>) tensor(0.1457, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17714 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17715 |
+
{'train/tv_loss': 0.0001431380049325526, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.21703669428825378, 'train/uncertainty_loss': 0.0040077798068523405, 'train/video_loss': 0.22220520675182343, 'train/total_loss': 0.22222697734832764}
|
| 17716 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17717 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17718 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.19441068172454834, 'train/info_loss': 0.2764996588230133, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010609085438773036, 'train/video_loss': 0.2763935625553131, 'train/total_loss': 0.47080424427986145}
|
| 17719 |
+
tensor(0.0335, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17720 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17721 |
+
[Rank 1] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 3] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 0] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}
|
| 17722 |
+
|
| 17723 |
+
[Rank 2] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}
|
| 17724 |
+
|
| 17725 |
+
{'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06, 'epoch': 0.67}
|
| 17726 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17727 |
+
tensor(0.1813, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17728 |
+
tensor(0.1105, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17729 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17730 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10859153270721436, 'train/info_loss': 0.21142883598804474, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010824981145560742, 'train/video_loss': 0.21132057905197144, 'train/total_loss': 0.3199121057987213}
|
| 17731 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17732 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17733 |
+
tensor(0.2025, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17734 |
+
{'train/tv_loss': 0.00026534441858530047, 'train/lm_loss': 3.2517651561647654e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.34838536381721497, 'train/uncertainty_loss': 0.020245753228664398, 'train/video_loss': 0.3707745671272278, 'train/total_loss': 0.37080708146095276}
|
| 17735 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17736 |
+
[Rank 3] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 1] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 2] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}
|
| 17737 |
+
|
| 17738 |
+
|
| 17739 |
+
[Rank 0] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}
|
| 17740 |
+
{'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06, 'epoch': 0.67}
|
| 17741 |
+
tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17742 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17743 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.4450415134429932, 'train/info_loss': 0.18519927561283112, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010915255406871439, 'train/video_loss': 0.18509012460708618, 'train/total_loss': 0.6301316022872925}
|
| 17744 |
+
tensor(0.2665, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17745 |
+
tensor(0.2499, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17746 |
+
tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17747 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.31789367198944096, 'train/info_loss': 0.2461404800415039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014492600457742812, 'train/video_loss': 0.24599555134773254, 'train/total_loss': 0.5638892650604248}
|
| 17748 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17749 |
+
tensor(0.4158, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17750 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17751 |
+
[Rank 1] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
|
| 17752 |
+
[Rank 3] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
|
| 17753 |
+
[Rank 0] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
|
| 17754 |
+
[Rank 2] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
|
| 17755 |
+
{'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06, 'epoch': 0.67}
|
| 17756 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17757 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17758 |
+
tensor(0.0608, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17759 |
+
tensor(0.1152, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17760 |
+
{'train/tv_loss': 0.000334132113493979, 'train/lm_loss': 3.177867038175464e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2817174196243286, 'train/uncertainty_loss': 0.011518295109272004, 'train/video_loss': 0.2959294617176056, 'train/total_loss': 0.2959612309932709}
|
| 17761 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17762 |
+
tensor(0.3057, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17763 |
+
{'train/tv_loss': 0.0002599612809717655, 'train/lm_loss': 4.741583543363959e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.42135030031204224, 'train/uncertainty_loss': 0.03057071566581726, 'train/video_loss': 0.4540237784385681, 'train/total_loss': 0.45407119393348694}
|
| 17764 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17765 |
+
tensor(0.0280, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17766 |
+
[Rank 1] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 2] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}
|
| 17767 |
+
|
| 17768 |
+
[Rank 0] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 3] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}
|
| 17769 |
+
|
| 17770 |
+
{'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06, 'epoch': 0.67}
|
| 17771 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17772 |
+
tensor(0.0697, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17773 |
+
{'train/tv_loss': 0.0003155144164338708, 'train/lm_loss': 5.3374795243144035e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.22853229939937592, 'train/uncertainty_loss': 0.006969699263572693, 'train/video_loss': 0.2380506843328476, 'train/total_loss': 0.23810406029224396}
|
| 17774 |
+
tensor(0.2178, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17775 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
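For reference, the loss dictionaries appended above appear to report 'train/total_loss' as the sum of 'train/lm_loss' and 'train/video_loss'. Below is a minimal sketch (an observation about these entries, not a documented formula) that spot-checks the relation on one logged line, assuming the log prints plain Python dict literals as shown.

import ast
import math

# One of the dict lines appended above (train.log line 17419).
line = ("{'train/tv_loss': None, 'train/lm_loss': 0.36234724521636963, "
        "'train/info_loss': 0.14337365329265594, 'train/ref_loss': None, "
        "'train/uncertainty_loss': -0.00011461186222732067, "
        "'train/video_loss': 0.14325904846191406, "
        "'train/total_loss': 0.5056062936782837}")

entry = ast.literal_eval(line)  # parse the printed dict literal safely
recomposed = entry['train/lm_loss'] + entry['train/video_loss']
print(math.isclose(entry['train/total_loss'], recomposed, rel_tol=1e-5))  # expected: True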