Training in progress, step 3050
Browse files
- adapter_model.safetensors +1 -1
- train.log +373 -0
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d34e23c8484e80b6ef5c7243ef0e9595e8747d365664c46ffcfd790b9e48b3eb
 size 1204780872
train.log
CHANGED
@@ -15526,3 +15526,376 @@ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device=
|
|
15526 |
tensor(0.0556, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15527 |
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15528 |
{'train/tv_loss': 0.00018669653218239546, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4662527973996475e-05, 'train/ref_loss': 0.09598883241415024, 'train/uncertainty_loss': -6.740234675817192e-05, 'train/video_loss': 0.09742966294288635, 'train/total_loss': 0.09744639694690704}
|
15529 |
+
[Rank 2] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}[Rank 3] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}
|
15530 |
+
[Rank 1] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}
|
15531 |
+
|
15532 |
+
[Rank 0] Trainer log: {'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08}
|
15533 |
+
{'loss': 0.2252, 'grad_norm': 2.7614905834198, 'learning_rate': 3.030577462765139e-08, 'epoch': 0.98}
|
15534 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15535 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15536 |
+
tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
|
15537 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.1601017475128174, 'train/info_loss': 0.18674495816230774, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000141742208506912, 'train/video_loss': 0.18660321831703186, 'train/total_loss': 0.34670495986938477}
|
15538 |
+
tensor(0.0981, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15539 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
15540 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15541 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15542 |
+
tensor(0.0275, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15543 |
+
{'train/tv_loss': 0.00012622961075976492, 'train/lm_loss': 4.0336354868486526e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.2368880957365036, 'train/uncertainty_loss': 0.002750598080456257, 'train/video_loss': 0.24066415429115295, 'train/total_loss': 0.2407044917345047}
|
15544 |
+
[Rank 1] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}
|
15545 |
+
[Rank 2] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}[Rank 3] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}
|
15546 |
+
|
15547 |
+
[Rank 0] Trainer log: {'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08}
|
15548 |
+
{'loss': 0.3173, 'grad_norm': 3.536466360092163, 'learning_rate': 2.9481571807336018e-08, 'epoch': 0.98}
|
15549 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
15550 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
15551 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.022706881165504456, 'train/info_loss': 0.18825730681419373, 'train/ref_loss': None, 'train/uncertainty_loss': -9.704146068543196e-05, 'train/video_loss': 0.1881602704524994, 'train/total_loss': 0.21086715161800385}
|
15552 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
15553 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
15554 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
15555 |
+
tensor(0.0030, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15556 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15557 |
+
tensor(0.2461, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15558 |
+
{'train/tv_loss': 0.00019956468604505063, 'train/lm_loss': 1.6450205293949693e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.38291317224502563, 'train/uncertainty_loss': 0.024610528349876405, 'train/video_loss': 0.4091346263885498, 'train/total_loss': 0.4091510772705078}
|
15559 |
+
[Rank 3] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}[Rank 2] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}
|
15560 |
+
|
15561 |
+
[Rank 0] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}[Rank 1] Trainer log: {'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08}
|
15562 |
+
|
15563 |
+
{'loss': 0.235, 'grad_norm': 2.253711223602295, 'learning_rate': 2.866871509327962e-08, 'epoch': 0.98}
|
15564 |
+
tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:0', grad_fn=<MulBackward0>)
|
15565 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.27252380847930907, 'train/info_loss': 0.33596906065940857, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015895817195996643, 'train/video_loss': 0.3358100950717926, 'train/total_loss': 0.608333945274353}
|
15566 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15567 |
+
tensor(0.2933, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15568 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
15569 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
15570 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
15571 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.021349179744720462, 'train/info_loss': 0.18826666474342346, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010396195575594902, 'train/video_loss': 0.188162699341774, 'train/total_loss': 0.2095118761062622}
|
15572 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15573 |
+
tensor(0.0953, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15574 |
+
[Rank 0] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}[Rank 1] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}
|
15575 |
+
[Rank 3] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}
|
15576 |
+
|
15577 |
+
[Rank 2] Trainer log: {'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08}
|
15578 |
+
{'loss': 0.3715, 'grad_norm': 3.0025641918182373, 'learning_rate': 2.7867205410484488e-08, 'epoch': 0.98}
|
15579 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
15580 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
15581 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3378294467926026, 'train/info_loss': 0.1996716856956482, 'train/ref_loss': None, 'train/uncertainty_loss': -8.704561041668059e-05, 'train/video_loss': 0.19958463311195374, 'train/total_loss': 0.5374140739440918}
|
15582 |
+
tensor(0.0409, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15583 |
+
tensor(0.0946, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15584 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
15585 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
15586 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
15587 |
+
tensor(0.0392, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15588 |
+
{'train/tv_loss': 0.0001396071049384773, 'train/lm_loss': 2.4341055541299285e-05, 'train/info_loss': 1.633140527701471e-05, 'train/ref_loss': 0.2384827733039856, 'train/uncertainty_loss': 0.003921708464622498, 'train/video_loss': 0.24353766441345215, 'train/total_loss': 0.2435620129108429}
|
15589 |
+
[Rank 2] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}
|
15590 |
+
[Rank 1] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}[Rank 3] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}
|
15591 |
+
|
15592 |
+
[Rank 0] Trainer log: {'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08}
|
15593 |
+
{'loss': 0.3805, 'grad_norm': 4.503754615783691, 'learning_rate': 2.7077043671039914e-08, 'epoch': 0.98}
|
15594 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
15595 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
15596 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.4039686679840088, 'train/info_loss': 0.2144818902015686, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012274450855329633, 'train/video_loss': 0.21435914933681488, 'train/total_loss': 0.618327796459198}
|
15597 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15598 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15599 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
15600 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15601 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15602 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15603 |
+
{'train/tv_loss': 0.00021858105901628733, 'train/lm_loss': 2.1337324869818986e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.06369525194168091, 'train/uncertainty_loss': -7.047837134450674e-05, 'train/video_loss': 0.06539083272218704, 'train/total_loss': 0.06541217118501663}
|
15604 |
+
[Rank 1] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}[Rank 3] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}
|
15605 |
+
|
15606 |
+
[Rank 2] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}
|
15607 |
+
[Rank 0] Trainer log: {'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08}
|
15608 |
+
{'loss': 0.3141, 'grad_norm': 1.3152093887329102, 'learning_rate': 2.62982307741233e-08, 'epoch': 0.98}
|
15609 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15610 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3192060708999634, 'train/info_loss': 0.10271652787923813, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010767884086817504, 'train/video_loss': 0.10260885208845139, 'train/total_loss': 0.4218149483203888}
|
15611 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
15612 |
+
tensor(0.0468, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15613 |
+
tensor(0.5917, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15614 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
15615 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3076723337173462, 'train/info_loss': 0.20474545657634735, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013230217155069113, 'train/video_loss': 0.20461314916610718, 'train/total_loss': 0.5122854709625244}
|
15616 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15617 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
15618 |
+
tensor(0.2428, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15619 |
+
[Rank 1] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
|
15620 |
+
[Rank 2] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
|
15621 |
+
[Rank 0] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
|
15622 |
+
[Rank 3] Trainer log: {'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08}
|
15623 |
+
{'loss': 0.4295, 'grad_norm': 5.666561126708984, 'learning_rate': 2.553076760599793e-08, 'epoch': 0.98}
|
15624 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
15625 |
+
tensor(0.0288, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15626 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15627 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
15628 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2875659942626953, 'train/info_loss': 0.11822149902582169, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011541531421244144, 'train/video_loss': 0.11810608208179474, 'train/total_loss': 0.4056720733642578}
|
15629 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15630 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
15631 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15632 |
+
{'train/tv_loss': 0.00014680708991363645, 'train/lm_loss': 1.6664764552842828e-05, 'train/info_loss': 1.5139350580284372e-05, 'train/ref_loss': 0.1329575628042221, 'train/uncertainty_loss': -6.927012582309544e-05, 'train/video_loss': 0.13407787680625916, 'train/total_loss': 0.13409453630447388}
|
15633 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15634 |
+
[Rank 1] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}[Rank 2] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}
|
15635 |
+
[Rank 3] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}
|
15636 |
+
|
15637 |
+
[Rank 0] Trainer log: {'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08}
|
15638 |
+
{'loss': 0.3915, 'grad_norm': 2.96402645111084, 'learning_rate': 2.4774655040008534e-08, 'epoch': 0.98}
|
15639 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
15640 |
+
tensor(0.0327, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15641 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15642 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15643 |
+
{'train/tv_loss': 0.00015368129825219513, 'train/lm_loss': 1.4590684440918268e-05, 'train/info_loss': 1.3112849956087302e-05, 'train/ref_loss': 0.08234774321317673, 'train/uncertainty_loss': -6.846861215308309e-05, 'train/video_loss': 0.08352183550596237, 'train/total_loss': 0.08353642374277115}
|
15644 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
15645 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15646 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
15647 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15648 |
+
{'train/tv_loss': 0.00024882378056645393, 'train/lm_loss': 2.4436411331407728e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.04439636319875717, 'train/uncertainty_loss': -6.729786400683224e-05, 'train/video_loss': 0.04633735865354538, 'train/total_loss': 0.04636179655790329}
|
15649 |
+
[Rank 3] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}[Rank 1] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}[Rank 2] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}
|
15650 |
+
|
15651 |
+
|
15652 |
+
[Rank 0] Trainer log: {'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08}
|
15653 |
+
{'loss': 0.3067, 'grad_norm': 5.423553466796875, 'learning_rate': 2.4029893936586835e-08, 'epoch': 0.98}
|
15654 |
+
tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
|
15655 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15656 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15657 |
+
tensor(0.3023, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15658 |
+
{'train/tv_loss': 0.00023792595602571966, 'train/lm_loss': 1.6783963656052946e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.4113433361053467, 'train/uncertainty_loss': 0.030228328704833985, 'train/video_loss': 0.44348907470703125, 'train/total_loss': 0.4435058534145355}
|
15659 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15660 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15661 |
+
tensor(0.3504, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15662 |
+
{'train/tv_loss': 0.0002617940539494157, 'train/lm_loss': 1.4757565804757179e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.45626747608184814, 'train/uncertainty_loss': 0.035035207867622375, 'train/video_loss': 0.4934108257293701, 'train/total_loss': 0.493425577878952}
|
15663 |
+
tensor(0.1190, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15664 |
+
[Rank 1] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}
|
15665 |
+
[Rank 0] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}[Rank 3] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}
|
15666 |
+
|
15667 |
+
[Rank 2] Trainer log: {'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08}
|
15668 |
+
{'loss': 0.4272, 'grad_norm': 13.721080780029297, 'learning_rate': 2.3296485143244896e-08, 'epoch': 0.98}
|
15669 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
15670 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
15671 |
+
tensor(0.0039, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15672 |
+
{'train/tv_loss': 0.00018340945243835451, 'train/lm_loss': 2.1456521062646063e-05, 'train/info_loss': 1.537776188342832e-05, 'train/ref_loss': 0.21618469059467316, 'train/uncertainty_loss': 0.0003915365319699049, 'train/video_loss': 0.21805888414382935, 'train/total_loss': 0.2180803418159485}
|
15673 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15674 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15675 |
+
tensor(0.3194, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15676 |
+
tensor(0.1016, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15677 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15678 |
+
{'train/tv_loss': 0.00016525617102161052, 'train/lm_loss': 2.1575717255473137e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.14564038813114166, 'train/uncertainty_loss': -6.878315471112729e-05, 'train/video_loss': 0.14691051840782166, 'train/total_loss': 0.14693209528923035}
|
15679 |
+
[Rank 0] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}[Rank 1] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}
|
15680 |
+
[Rank 3] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}
|
15681 |
+
|
15682 |
+
[Rank 2] Trainer log: {'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08}
|
15683 |
+
{'loss': 0.3268, 'grad_norm': 4.3162922859191895, 'learning_rate': 2.2574429494575112e-08, 'epoch': 0.98}
|
15684 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
15685 |
+
tensor(0.2224, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:2', grad_fn=<MulBackward0>)
|
15686 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15687 |
+
tensor(0.3880, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15688 |
+
{'train/tv_loss': 0.0002865825081244111, 'train/lm_loss': 7.627939921803773e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.47501638531684875, 'train/uncertainty_loss': 0.03880217373371125, 'train/video_loss': 0.5161335468292236, 'train/total_loss': 0.5162098407745361}
|
15689 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
15690 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
15691 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2736924409866333, 'train/info_loss': 0.19035007059574127, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012445753673091532, 'train/video_loss': 0.19022561609745026, 'train/total_loss': 0.4639180898666382}
|
15692 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
15693 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15694 |
+
[Rank 1] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}
|
15695 |
+
[Rank 0] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}[Rank 3] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}
|
15696 |
+
|
15697 |
+
[Rank 2] Trainer log: {'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08}
|
15698 |
+
{'loss': 0.3889, 'grad_norm': 8.067330360412598, 'learning_rate': 2.1863727812254653e-08, 'epoch': 0.98}
|
15699 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15700 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
15701 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.14509937763214112, 'train/info_loss': 0.2527730464935303, 'train/ref_loss': None, 'train/uncertainty_loss': -8.878109510987998e-05, 'train/video_loss': 0.25268426537513733, 'train/total_loss': 0.39778363704681396}
|
15702 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15703 |
+
tensor(0.5379, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
|
15704 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15705 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
15706 |
+
tensor(0.1698, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15707 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15708 |
+
{'train/tv_loss': 9.16590914130211e-05, 'train/lm_loss': 3.5878777271136646e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.069771908223629, 'train/uncertainty_loss': -6.670778384432197e-05, 'train/video_loss': 0.07045945525169373, 'train/total_loss': 0.07049533724784851}
|
15709 |
+
[Rank 1] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
|
15710 |
+
[Rank 3] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
|
15711 |
+
[Rank 2] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
|
15712 |
+
[Rank 0] Trainer log: {'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08}
|
15713 |
+
{'loss': 0.3645, 'grad_norm': 2.2399256229400635, 'learning_rate': 2.1164380905035476e-08, 'epoch': 0.98}
|
15714 |
+
tensor(-0.0017, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0017, device='cuda:2', grad_fn=<MulBackward0>)
|
15715 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15716 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15717 |
+
tensor(0.4245, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15718 |
+
{'train/tv_loss': 0.0001862031174823642, 'train/lm_loss': 1.2969550152774901e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.5063884854316711, 'train/uncertainty_loss': 0.042449754476547245, 'train/video_loss': 0.5503416657447815, 'train/total_loss': 0.5503546595573425}
|
15719 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
15720 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
15721 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
15722 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3805509328842163, 'train/info_loss': 0.2284892201423645, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011929129250347615, 'train/video_loss': 0.22836992144584656, 'train/total_loss': 0.6089208722114563}
|
15723 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
15724 |
+
[Rank 2] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}
|
15725 |
+
[Rank 1] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}
|
15726 |
+
[Rank 0] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}[Rank 3] Trainer log: {'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08}
|
15727 |
+
|
15728 |
+
{'loss': 0.4398, 'grad_norm': 3.0451931953430176, 'learning_rate': 2.0476389568749867e-08, 'epoch': 0.98}
|
15729 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
15730 |
+
tensor(0.0213, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15731 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15732 |
+
tensor(0.1013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15733 |
+
{'train/tv_loss': 0.00010751406662166119, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.28108254075050354, 'train/uncertainty_loss': 0.010134818404912949, 'train/video_loss': 0.29209190607070923, 'train/total_loss': 0.2921086549758911}
|
15734 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15735 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15736 |
+
tensor(0.1053, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15737 |
+
tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
15738 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.13767752647399903, 'train/info_loss': 0.11634093523025513, 'train/ref_loss': None, 'train/uncertainty_loss': -8.035243372432888e-05, 'train/video_loss': 0.1162605807185173, 'train/total_loss': 0.25393810868263245}
|
15739 |
+
[Rank 3] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}
|
15740 |
+
[Rank 0] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}[Rank 1] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}
|
15741 |
+
[Rank 2] Trainer log: {'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08}
|
15742 |
+
|
15743 |
+
{'loss': 0.2633, 'grad_norm': 11.52269172668457, 'learning_rate': 1.979975458631045e-08, 'epoch': 0.98}
|
15744 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15745 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15746 |
+
tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
|
15747 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.340410852432251, 'train/info_loss': 0.22465088963508606, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013702560681849718, 'train/video_loss': 0.22451385855674744, 'train/total_loss': 0.5649247169494629}
|
15748 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
15749 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15750 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
15751 |
+
tensor(0.3107, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15752 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15753 |
+
{'train/tv_loss': 0.00016929933335632088, 'train/lm_loss': 1.6664764552842828e-05, 'train/info_loss': 1.3530071555578616e-05, 'train/ref_loss': 0.1848124861717224, 'train/uncertainty_loss': -6.540960166603328e-05, 'train/video_loss': 0.18611499667167664, 'train/total_loss': 0.18613165616989136}
|
15754 |
+
[Rank 1] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}[Rank 3] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}[Rank 2] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}
|
15755 |
+
|
15756 |
+
|
15757 |
+
[Rank 0] Trainer log: {'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08}
|
15758 |
+
{'loss': 0.4731, 'grad_norm': 6.321042060852051, 'learning_rate': 1.913447672770241e-08, 'epoch': 0.98}
|
15759 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
15760 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
15761 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
15762 |
+
tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
15763 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.030660930275917056, 'train/info_loss': 0.1238495334982872, 'train/ref_loss': None, 'train/uncertainty_loss': -8.383804815821351e-05, 'train/video_loss': 0.12376569211483002, 'train/total_loss': 0.15442661941051483}
|
15764 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15765 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
15766 |
+
tensor(0.1368, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15767 |
+
{'train/tv_loss': 0.00013003707863390446, 'train/lm_loss': 1.2945709750056267e-05, 'train/info_loss': 1.3530071555578616e-05, 'train/ref_loss': 0.3049245774745941, 'train/uncertainty_loss': 0.013680557906627656, 'train/video_loss': 0.31965896487236023, 'train/total_loss': 0.3196718990802765}
|
15768 |
+
tensor(0.0753, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15769 |
+
[Rank 3] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}[Rank 2] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}[Rank 0] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}
|
15770 |
+
|
15771 |
+
|
15772 |
+
{'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08, 'epoch': 0.98}[Rank 1] Trainer log: {'loss': 0.2912, 'grad_norm': 9.444515228271484, 'learning_rate': 1.8480556749991274e-08}
|
15773 |
+
|
15774 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15775 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
15776 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15777 |
+
tensor(0.1515, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15778 |
+
{'train/tv_loss': 0.0001313490211032331, 'train/lm_loss': 1.4757565804757179e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.31524068117141724, 'train/uncertainty_loss': 0.01514904797077179, 'train/video_loss': 0.3314545452594757, 'train/total_loss': 0.3314692974090576}
|
15779 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15780 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.19247951507568362, 'train/info_loss': 0.1875615417957306, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010782489553093911, 'train/video_loss': 0.1874537169933319, 'train/total_loss': 0.37993323802948}
|
15781 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
15782 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15783 |
+
tensor(0.1763, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15784 |
+
[Rank 0] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}[Rank 3] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}
|
15785 |
+
|
15786 |
+
[Rank 1] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}
|
15787 |
+
[Rank 2] Trainer log: {'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08}
|
15788 |
+
{'loss': 0.274, 'grad_norm': 2.9086925983428955, 'learning_rate': 1.7837995397312903e-08, 'epoch': 0.98}
|
15789 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15790 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3090352535247803, 'train/info_loss': 0.11806312948465347, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010535655310377479, 'train/video_loss': 0.11795777082443237, 'train/total_loss': 0.4269930422306061}
|
15791 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15792 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
15793 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
15794 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15795 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
15796 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.0443669855594635, 'train/info_loss': 0.20817534625530243, 'train/ref_loss': None, 'train/uncertainty_loss': -8.949771290645003e-05, 'train/video_loss': 0.20808584988117218, 'train/total_loss': 0.2524528503417969}
|
15797 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
15798 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15799 |
+
[Rank 1] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}[Rank 0] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}
|
15800 |
+
[Rank 2] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}
|
15801 |
+
|
15802 |
+
[Rank 3] Trainer log: {'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08}
|
15803 |
+
{'loss': 0.3108, 'grad_norm': 2.2581498622894287, 'learning_rate': 1.7206793400880163e-08, 'epoch': 0.98}
|
15804 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
15805 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15806 |
+
tensor(0.1164, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15807 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15808 |
+
{'train/tv_loss': 0.00012342289555817844, 'train/lm_loss': 2.7726159896701577e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.15569385886192322, 'train/uncertainty_loss': -6.904435576871037e-05, 'train/video_loss': 0.15662990510463715, 'train/total_loss': 0.1566576361656189}
|
15809 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15810 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.060094434022903445, 'train/info_loss': 0.060833293944597244, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011266678338870407, 'train/video_loss': 0.06072062626481056, 'train/total_loss': 0.12081506103277206}
|
15811 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
15812 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15813 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
15814 |
+
[Rank 0] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}[Rank 3] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}[Rank 2] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}
|
15815 |
+
|
15816 |
+
|
15817 |
+
[Rank 1] Trainer log: {'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08}
|
15818 |
+
{'loss': 0.3028, 'grad_norm': 4.024272441864014, 'learning_rate': 1.6586951478981818e-08, 'epoch': 0.98}
|
15819 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15820 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15821 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.19400665760040284, 'train/info_loss': 0.0838468000292778, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010808079969137907, 'train/video_loss': 0.08373872190713882, 'train/total_loss': 0.27774539589881897}
|
15822 |
+
tensor(0.6478, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15823 |
+
tensor(0.1944, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15824 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
15825 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15826 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15827 |
+
{'train/tv_loss': 0.00012543149059638382, 'train/lm_loss': 4.641471023205668e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.07925719767808914, 'train/uncertainty_loss': -6.660351064056159e-05, 'train/video_loss': 0.08021145313978195, 'train/total_loss': 0.08025787025690079}
|
15828 |
+
tensor(0.0176, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15829 |
+
[Rank 2] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
|
15830 |
+
[Rank 3] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
|
15831 |
+
[Rank 1] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
|
15832 |
+
[Rank 0] Trainer log: {'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08}
|
15833 |
+
{'loss': 0.2885, 'grad_norm': 9.745797157287598, 'learning_rate': 1.597847033697475e-08, 'epoch': 0.98}
|
15834 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
15835 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.07550049424171448, 'train/info_loss': 0.2326270490884781, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010250152554363013, 'train/video_loss': 0.2325245440006256, 'train/total_loss': 0.3080250322818756}
|
15836 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15837 |
+
tensor(0.0303, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15838 |
+
tensor(0.0823, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15839 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15840 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3728404760360718, 'train/info_loss': 0.21700052917003632, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011042470578104258, 'train/video_loss': 0.2168901115655899, 'train/total_loss': 0.5897306203842163}
|
15841 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
15842 |
+
tensor(0.1946, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:1', grad_fn=<MulBackward0>)
|
15843 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15844 |
+
[Rank 0] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}[Rank 3] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}
|
15845 |
+
[Rank 1] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}
|
15846 |
+
|
15847 |
+
[Rank 2] Trainer log: {'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08}
|
15848 |
+
{'loss': 0.3109, 'grad_norm': 2.3931143283843994, 'learning_rate': 1.538135066728841e-08, 'epoch': 0.98}
|
15849 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
15850 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15851 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15852 |
+
{'train/tv_loss': 0.00019662757404148579, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.1474931538105011, 'train/uncertainty_loss': -6.86797546222806e-05, 'train/video_loss': 0.14901168644428253, 'train/total_loss': 0.14902842044830322}
|
15853 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15854 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
15855 |
+
tensor(0.2225, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15856 |
+
{'train/tv_loss': 0.0002068731002509594, 'train/lm_loss': 1.9001058535650373e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.3619014620780945, 'train/uncertainty_loss': 0.02225349098443985, 'train/video_loss': 0.38582393527030945, 'train/total_loss': 0.3858429491519928}
|
15857 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15858 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
15859 |
+
[Rank 1] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}
|
15860 |
+
[Rank 3] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}[Rank 0] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}
|
15861 |
+
|
15862 |
+
[Rank 2] Trainer log: {'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08}
|
15863 |
+
{'loss': 0.3651, 'grad_norm': 5.535183429718018, 'learning_rate': 1.4795593149427023e-08, 'epoch': 0.98}
|
15864 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
15865 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.1908965349197388, 'train/info_loss': 0.2302519679069519, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012198447948321701, 'train/video_loss': 0.23012998700141907, 'train/total_loss': 0.4210265278816223}
|
15866 |
+
tensor(0.2627, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:1', grad_fn=<MulBackward0>)
|
15867 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15868 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15869 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15870 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
15871 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
15872 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3204694747924805, 'train/info_loss': 0.20709535479545593, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012635280145332218, 'train/video_loss': 0.2069690078496933, 'train/total_loss': 0.5274384617805481}
|
15873 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
15874 |
+
[Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}
|
15875 |
+
[Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}
|
15876 |
+
[Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08}
|
15877 |
+
|
15878 |
+
{'loss': 0.3904, 'grad_norm': 7.206387042999268, 'learning_rate': 1.4221198449960727e-08, 'epoch': 0.98}
|
15879 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
15880 |
+
tensor(0.0467, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15881 |
+
tensor(0.1316, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15882 |
+
tensor(0.3947, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15883 |
+
{'train/tv_loss': 0.00013940890785306694, 'train/lm_loss': 5.173014360480011e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.48780110478401184, 'train/uncertainty_loss': 0.03946700990200043, 'train/video_loss': 0.5284010767936707, 'train/total_loss': 0.5284528136253357}
|
15884 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
15885 |
+
tensor(0.1995, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15886 |
+
tensor(0.1702, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
15887 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15888 |
+
{'train/tv_loss': 0.00011268751695752144, 'train/lm_loss': 1.680780405877158e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.07056979835033417, 'train/uncertainty_loss': -6.798054673708976e-05, 'train/video_loss': 0.07141750305891037, 'train/total_loss': 0.07143431156873703}
|
15889 |
+
[Rank 0] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}[Rank 1] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}
|
15890 |
+
[Rank 2] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}
|
15891 |
+
|
15892 |
+
[Rank 3] Trainer log: {'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08}
|
15893 |
+
{'loss': 0.3433, 'grad_norm': 4.602771759033203, 'learning_rate': 1.3658167222529994e-08, 'epoch': 0.98}
|
15894 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
15895 |
+
tensor(0.1286, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
15896 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
15897 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
15898 |
+
{'train/tv_loss': 9.077095310203732e-05, 'train/lm_loss': 1.8905699835158885e-05, 'train/info_loss': 1.4006895071361214e-05, 'train/ref_loss': 0.1560274064540863, 'train/uncertainty_loss': -6.968472735024989e-05, 'train/video_loss': 0.15669789910316467, 'train/total_loss': 0.15671680867671967}
|
15899 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
15900 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10165367126464844, 'train/info_loss': 0.22112657129764557, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010826733196154237, 'train/video_loss': 0.22101829946041107, 'train/total_loss': 0.3226719796657562}
|
15901 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|