Training in progress, step 2050
- adapter_model.safetensors +1 -1
- train.log +385 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:29a81378ab1f78803e81097b916a079f8e851fd7ab413fde0729ae1de1e9207a
 size 1204780872
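Note: the adapter_model.safetensors entry above is a Git LFS pointer rather than the weights themselves; the repository tracks only the version/oid/size triplet, while the ~1.2 GB adapter file lives in LFS storage. Below is a minimal sketch (not part of this repository) of how a downloaded copy could be checked against the oid and size recorded in this commit; the local file path is an assumption.

import hashlib
from pathlib import Path

# Assumed local path of the downloaded adapter; adjust to your checkout.
ADAPTER_PATH = Path("adapter_model.safetensors")

# Values taken from the LFS pointer in this commit.
EXPECTED_OID = "29a81378ab1f78803e81097b916a079f8e851fd7ab413fde0729ae1de1e9207a"
EXPECTED_SIZE = 1204780872

def verify_lfs_pointer(path: Path, oid: str, size: int) -> bool:
    """Return True if the file matches the sha256 oid and size from the pointer."""
    if path.stat().st_size != size:
        return False
    digest = hashlib.sha256()
    with path.open("rb") as f:
        # Hash in 1 MiB chunks to avoid loading the whole file into memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid

if __name__ == "__main__":
    print("pointer matches:", verify_lfs_pointer(ADAPTER_PATH, EXPECTED_OID, EXPECTED_SIZE))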
train.log CHANGED
@@ -17028,3 +17028,388 @@ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device=
| 17028 | {'train/tv_loss': None, 'train/lm_loss': 0.32293994426727296, 'train/info_loss': 0.12928418815135956, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011226574424654246, 'train/video_loss': 0.1291719228029251, 'train/total_loss': 0.4521118998527527}
| 17029 | tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
| 17030 | tensor(0.2072, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
| 17031 |
+
[Rank 0] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}[Rank 3] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}[Rank 1] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}
|
| 17032 |
+
|
| 17033 |
+
|
| 17034 |
+
[Rank 2] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}
|
| 17035 |
+
{'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06, 'epoch': 0.65}
|
| 17036 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17037 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17038 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17039 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17040 |
+
{'train/tv_loss': 0.00020880883093923332, 'train/lm_loss': 2.4817834491841496e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.18321365118026733, 'train/uncertainty_loss': -6.815286469645798e-05, 'train/video_loss': 0.1848347932100296, 'train/total_loss': 0.18485960364341736}
|
| 17041 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17042 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17043 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2403105974197388, 'train/info_loss': 0.2913327217102051, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010404838249087334, 'train/video_loss': 0.29122868180274963, 'train/total_loss': 0.531539261341095}
|
| 17044 |
+
tensor(0.0252, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17045 |
+
tensor(0.0596, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17046 |
+
[Rank 3] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}[Rank 1] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}
|
| 17047 |
+
|
| 17048 |
+
[Rank 0] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}
|
| 17049 |
+
[Rank 2] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}
|
| 17050 |
+
{'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06, 'epoch': 0.65}
|
| 17051 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17052 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.38158850669860844, 'train/info_loss': 0.06055440008640289, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001311479019932449, 'train/video_loss': 0.06042325124144554, 'train/total_loss': 0.4420117735862732}
|
| 17053 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17054 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17055 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17056 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17057 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17058 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17059 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2511230230331421, 'train/info_loss': 0.2582288980484009, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012226408580318093, 'train/video_loss': 0.25810661911964417, 'train/total_loss': 0.5092296600341797}
|
| 17060 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17061 |
+
[Rank 3] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}[Rank 0] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}
|
| 17062 |
+
[Rank 1] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}
|
| 17063 |
+
|
| 17064 |
+
[Rank 2] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}
|
| 17065 |
+
{'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06, 'epoch': 0.65}
|
| 17066 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17067 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17068 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3681243896484375, 'train/info_loss': 0.07267985492944717, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012856742832809687, 'train/video_loss': 0.07255128771066666, 'train/total_loss': 0.44067567586898804}
|
| 17069 |
+
tensor(0.2176, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17070 |
+
tensor(0.0752, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17071 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17072 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.29500226974487304, 'train/info_loss': 0.15204651653766632, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010267778998240829, 'train/video_loss': 0.15194383263587952, 'train/total_loss': 0.4469461143016815}
|
| 17073 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17074 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17075 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17076 |
+
[Rank 0] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}[Rank 2] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}[Rank 1] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}
|
| 17077 |
+
|
| 17078 |
+
|
| 17079 |
+
[Rank 3] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}
|
| 17080 |
+
{'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06, 'epoch': 0.65}
|
| 17081 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17082 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17083 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.27378618717193604, 'train/info_loss': 0.18530026078224182, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001140484819188714, 'train/video_loss': 0.1851862072944641, 'train/total_loss': 0.45897239446640015}
|
| 17084 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17085 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17086 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17087 |
+
tensor(0.2328, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17088 |
+
tensor(0.1607, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17089 |
+
{'train/tv_loss': 0.00021110486704856157, 'train/lm_loss': 0.00011288317618891598, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.3208291828632355, 'train/uncertainty_loss': 0.016073283553123475, 'train/video_loss': 0.33861806988716125, 'train/total_loss': 0.33873096108436584}
|
| 17090 |
+
tensor(0.0314, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17091 |
+
[Rank 0] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}[Rank 1] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}
|
| 17092 |
+
|
| 17093 |
+
[Rank 2] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}
|
| 17094 |
+
[Rank 3] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}
|
| 17095 |
+
{'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06, 'epoch': 0.66}
|
| 17096 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17097 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17098 |
+
tensor(0.0031, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17099 |
+
{'train/tv_loss': 0.00014839788200333716, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.21624045073986053, 'train/uncertainty_loss': 0.0003052026499062777, 'train/video_loss': 0.21775078773498535, 'train/total_loss': 0.21777255833148956}
|
| 17100 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17101 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17102 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17103 |
+
tensor(0.0286, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17104 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17105 |
+
{'train/tv_loss': 0.00024025768507272006, 'train/lm_loss': 3.1874023261480036e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.1827160120010376, 'train/uncertainty_loss': -6.815321394242347e-05, 'train/video_loss': 0.18458963930606842, 'train/total_loss': 0.18462151288986206}
|
| 17106 |
+
[Rank 1] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}
|
| 17107 |
+
[Rank 3] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}
|
| 17108 |
+
[Rank 0] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}[Rank 2] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}
|
| 17109 |
+
|
| 17110 |
+
{'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06, 'epoch': 0.66}
|
| 17111 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17112 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17113 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17114 |
+
tensor(0.0528, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17115 |
+
{'train/tv_loss': 0.00027445454616099597, 'train/lm_loss': 2.1790270693600178e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.250593900680542, 'train/uncertainty_loss': 0.0052795026451349265, 'train/video_loss': 0.2580859065055847, 'train/total_loss': 0.2581076920032501}
|
| 17116 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17117 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17118 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.21145300865173342, 'train/info_loss': 0.3242379426956177, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012023800518363715, 'train/video_loss': 0.3241176903247833, 'train/total_loss': 0.5355706810951233}
|
| 17119 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17120 |
+
tensor(0.0132, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17121 |
+
[Rank 2] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}[Rank 1] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}
|
| 17122 |
+
[Rank 3] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}
|
| 17123 |
+
|
| 17124 |
+
[Rank 0] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}
|
| 17125 |
+
{'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06, 'epoch': 0.66}
|
| 17126 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17127 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17128 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17129 |
+
tensor(0.0294, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17130 |
+
{'train/tv_loss': 0.00017149079358205201, 'train/lm_loss': 4.7129800077527764e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.2356904149055481, 'train/uncertainty_loss': 0.0029447738081216815, 'train/video_loss': 0.2400316596031189, 'train/total_loss': 0.24007879197597504}
|
| 17131 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17132 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17133 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17134 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10311661958694458, 'train/info_loss': 0.17362730205059052, 'train/ref_loss': None, 'train/uncertainty_loss': -9.315699571743608e-05, 'train/video_loss': 0.17353413999080658, 'train/total_loss': 0.2766507565975189}
|
| 17135 |
+
tensor(0.2431, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17136 |
+
[Rank 0] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}[Rank 1] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}[Rank 3] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}
|
| 17137 |
+
|
| 17138 |
+
[Rank 2] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}
|
| 17139 |
+
|
| 17140 |
+
{'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06, 'epoch': 0.66}
|
| 17141 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17142 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17143 |
+
tensor(0.1595, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17144 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17145 |
+
{'train/tv_loss': 0.00022061308845877647, 'train/lm_loss': 3.185018722433597e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.18868407607078552, 'train/uncertainty_loss': -6.92706322297454e-05, 'train/video_loss': 0.1903991401195526, 'train/total_loss': 0.19043098390102386}
|
| 17146 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17147 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.29790046215057375, 'train/info_loss': 0.1837206333875656, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011250831885263325, 'train/video_loss': 0.18360812962055206, 'train/total_loss': 0.48150861263275146}
|
| 17148 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17149 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17150 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17151 |
+
[Rank 1] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}[Rank 0] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}
|
| 17152 |
+
[Rank 3] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}
|
| 17153 |
+
|
| 17154 |
+
[Rank 2] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}
|
| 17155 |
+
{'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06, 'epoch': 0.66}
|
| 17156 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17157 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17158 |
+
tensor(0.1702, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17159 |
+
{'train/tv_loss': 0.00018524311017245056, 'train/lm_loss': 3.623634111136198e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.32768750190734863, 'train/uncertainty_loss': 0.017021270096302034, 'train/video_loss': 0.346211701631546, 'train/total_loss': 0.34624794125556946}
|
| 17160 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17161 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17162 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17163 |
+
tensor(0.1074, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17164 |
+
tensor(0.0083, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17165 |
+
{'train/tv_loss': 0.00019137355266138912, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.22056730091571808, 'train/uncertainty_loss': 0.0008338701911270619, 'train/video_loss': 0.22295451164245605, 'train/total_loss': 0.222990483045578}
|
| 17166 |
+
[Rank 0] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}[Rank 2] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}[Rank 3] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}
|
| 17167 |
+
|
| 17168 |
+
|
| 17169 |
+
{'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06, 'epoch': 0.66}[Rank 1] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}
|
| 17170 |
+
|
| 17171 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17172 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17173 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17174 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17175 |
+
{'train/tv_loss': 0.00026274872943758963, 'train/lm_loss': 3.1874023261480036e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.1149064302444458, 'train/uncertainty_loss': -7.323419558815659e-05, 'train/video_loss': 0.11695586889982224, 'train/total_loss': 0.11698774248361588}
|
| 17176 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17177 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17178 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17179 |
+
tensor(0.0592, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17180 |
+
{'train/tv_loss': 0.00016578567447140814, 'train/lm_loss': 6.0000957455486065e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.25501570105552673, 'train/uncertainty_loss': 0.0059223812073469165, 'train/video_loss': 0.2622901201248169, 'train/total_loss': 0.2623501121997833}
|
| 17181 |
+
[Rank 3] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}
|
| 17182 |
+
[Rank 1] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}[Rank 0] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}
|
| 17183 |
+
|
| 17184 |
+
[Rank 2] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}
|
| 17185 |
+
{'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06, 'epoch': 0.66}
|
| 17186 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17187 |
+
tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17188 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17189 |
+
{'train/tv_loss': 0.0003217862220481038, 'train/lm_loss': 4.653389332816005e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.17965355515480042, 'train/uncertainty_loss': -7.330098887905479e-05, 'train/video_loss': 0.1821758896112442, 'train/total_loss': 0.1822224259376526}
|
| 17190 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17191 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17192 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17193 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17194 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17195 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.21704933643341065, 'train/info_loss': 0.08072395622730255, 'train/ref_loss': None, 'train/uncertainty_loss': -9.479672298766673e-05, 'train/video_loss': 0.08062916249036789, 'train/total_loss': 0.29767850041389465}
|
| 17196 |
+
[Rank 1] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}[Rank 0] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}[Rank 3] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}
|
| 17197 |
+
|
| 17198 |
+
|
| 17199 |
+
[Rank 2] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}
|
| 17200 |
+
{'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06, 'epoch': 0.66}
|
| 17201 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17202 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17203 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17204 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.15884593725204468, 'train/info_loss': 0.09679526090621948, 'train/ref_loss': None, 'train/uncertainty_loss': -9.324780548922718e-05, 'train/video_loss': 0.09670200943946838, 'train/total_loss': 0.2555479407310486}
|
| 17205 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17206 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17207 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17208 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.13753668069839478, 'train/info_loss': 0.23872123658657074, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031742780469358, 'train/video_loss': 0.23861806094646454, 'train/total_loss': 0.37615475058555603}
|
| 17209 |
+
tensor(0.1312, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17210 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17211 |
+
[Rank 0] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}[Rank 3] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}[Rank 2] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}
|
| 17212 |
+
|
| 17213 |
+
[Rank 1] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}
|
| 17214 |
+
|
| 17215 |
+
{'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06, 'epoch': 0.66}
|
| 17216 |
+
tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17217 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17218 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3149502038955689, 'train/info_loss': 0.1556193083524704, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012698310893028976, 'train/video_loss': 0.15549232065677643, 'train/total_loss': 0.470442533493042}
|
| 17219 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17220 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17221 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17222 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17223 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.0765468716621399, 'train/info_loss': 0.2542925477027893, 'train/ref_loss': None, 'train/uncertainty_loss': -8.723060018382967e-05, 'train/video_loss': 0.2542053163051605, 'train/total_loss': 0.3307521939277649}
|
| 17224 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17225 |
+
tensor(0.0120, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17226 |
+
[Rank 0] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}[Rank 1] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}
|
| 17227 |
+
[Rank 2] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}
|
| 17228 |
+
|
| 17229 |
+
[Rank 3] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}
|
| 17230 |
+
{'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06, 'epoch': 0.66}
|
| 17231 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17232 |
+
tensor(0.1122, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17233 |
+
tensor(0.1959, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17234 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17235 |
+
{'train/tv_loss': 0.00016258273972198368, 'train/lm_loss': 6.1025843024253845e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.16883832216262817, 'train/uncertainty_loss': -7.159045781008899e-05, 'train/video_loss': 0.17009153962135315, 'train/total_loss': 0.17015255987644196}
|
| 17236 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17237 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17238 |
+
tensor(0.3660, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17239 |
+
{'train/tv_loss': 0.0002153045265004039, 'train/lm_loss': 2.496086817700416e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.4590832591056824, 'train/uncertainty_loss': 0.03660423159599304, 'train/video_loss': 0.49742814898490906, 'train/total_loss': 0.49745312333106995}
|
| 17240 |
+
tensor(0.2555, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17241 |
+
[Rank 3] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}[Rank 2] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}
|
| 17242 |
+
[Rank 1] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}
|
| 17243 |
+
|
| 17244 |
+
[Rank 0] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}
|
| 17245 |
+
{'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06, 'epoch': 0.66}
|
| 17246 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17247 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17248 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17249 |
+
tensor(1.1109, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17250 |
+
{'train/tv_loss': 0.0002126151230186224, 'train/lm_loss': 2.4722478701733053e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.9555550813674927, 'train/uncertainty_loss': 0.11109327077865601, 'train/video_loss': 1.0683666467666626, 'train/total_loss': 1.0683913230895996}
|
| 17251 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17252 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17253 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17254 |
+
tensor(0.0930, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17255 |
+
{'train/tv_loss': 0.00013671087799593806, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2791999578475952, 'train/uncertainty_loss': 0.009300475567579269, 'train/video_loss': 0.2896132469177246, 'train/total_loss': 0.2896498143672943}
|
| 17256 |
+
[Rank 1] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}[Rank 0] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}
|
| 17257 |
+
[Rank 3] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}
|
| 17258 |
+
|
| 17259 |
+
[Rank 2] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}
|
| 17260 |
+
{'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06, 'epoch': 0.66}
|
| 17261 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17262 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17263 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17264 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2787301301956177, 'train/info_loss': 0.2266109138727188, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011889106826856732, 'train/video_loss': 0.2264920175075531, 'train/total_loss': 0.5052221417427063}
|
| 17265 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17266 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17267 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17268 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17269 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17270 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.13043994903564454, 'train/info_loss': 0.17577716708183289, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001127044321037829, 'train/video_loss': 0.17566446959972382, 'train/total_loss': 0.3061044216156006}
|
| 17271 |
+
[Rank 1] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}[Rank 3] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}[Rank 0] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}
|
| 17272 |
+
|
| 17273 |
+
|
| 17274 |
+
[Rank 2] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}
|
| 17275 |
+
{'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06, 'epoch': 0.66}
|
| 17276 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17277 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17278 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17279 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17280 |
+
{'train/tv_loss': 0.00023483284749090672, 'train/lm_loss': 3.645087999757379e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.06819528341293335, 'train/uncertainty_loss': -6.94944174028933e-05, 'train/video_loss': 0.07002680748701096, 'train/total_loss': 0.07006325572729111}
|
| 17281 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17282 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17283 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17284 |
+
tensor(0.5629, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17285 |
+
{'train/tv_loss': 0.000887374859303236, 'train/lm_loss': 4.150436725467444e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.5856791138648987, 'train/uncertainty_loss': 0.05629110932350159, 'train/video_loss': 0.649091899394989, 'train/total_loss': 0.6491333842277527}
|
| 17286 |
+
[Rank 1] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}
|
| 17287 |
+
[Rank 3] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}
|
| 17288 |
+
[Rank 0] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}[Rank 2] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}
|
| 17289 |
+
|
| 17290 |
+
{'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06, 'epoch': 0.66}
|
| 17291 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17292 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17293 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17294 |
+
tensor(0.0157, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17295 |
+
{'train/tv_loss': 0.00012852794025093318, 'train/lm_loss': 5.418520304374397e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.22761771082878113, 'train/uncertainty_loss': 0.001567848213016987, 'train/video_loss': 0.23023721575737, 'train/total_loss': 0.23029139637947083}
|
| 17296 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17297 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17298 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17299 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17300 |
+
{'train/tv_loss': 0.0001663352712057531, 'train/lm_loss': 2.46032839640975e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.1792670488357544, 'train/uncertainty_loss': -7.328792125917972e-05, 'train/video_loss': 0.18054156005382538, 'train/total_loss': 0.18056616187095642}
|
| 17301 |
+
[Rank 1] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}[Rank 3] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}[Rank 0] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}
|
| 17302 |
+
|
| 17303 |
+
[Rank 2] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}
|
| 17304 |
+
|
| 17305 |
+
{'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06, 'epoch': 0.66}
|
| 17306 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17307 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17308 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17309 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17310 |
+
{'train/tv_loss': 0.00016776639968156816, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2100030779838562, 'train/uncertainty_loss': -7.060928619466722e-05, 'train/video_loss': 0.21129527688026428, 'train/total_loss': 0.21132317185401917}
|
| 17311 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17312 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17313 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17314 |
+
{'train/tv_loss': 0.00012384995352476835, 'train/lm_loss': 4.760652373079211e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.09827212989330292, 'train/uncertainty_loss': -7.054724264889956e-05, 'train/video_loss': 0.09921616315841675, 'train/total_loss': 0.09926377236843109}
|
| 17315 |
+
tensor(0.1509, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17316 |
+
[Rank 2] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}[Rank 3] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}[Rank 0] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}
|
| 17317 |
+
|
| 17318 |
+
[Rank 1] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}
|
| 17319 |
+
|
| 17320 |
+
{'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06, 'epoch': 0.66}
|
| 17321 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17322 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17323 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.20042383670806885, 'train/info_loss': 0.1449424922466278, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011801900109276175, 'train/video_loss': 0.14482447504997253, 'train/total_loss': 0.3452483117580414}
|
| 17324 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17325 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17326 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17327 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.45617589950561527, 'train/info_loss': 0.24490997195243835, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012810240732505918, 'train/video_loss': 0.24478186666965485, 'train/total_loss': 0.7009577751159668}
|
| 17328 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17329 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17330 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17331 |
+
[Rank 2] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}[Rank 1] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}[Rank 0] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}
|
| 17332 |
+
|
| 17333 |
+
[Rank 3] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}
|
| 17334 |
+
|
| 17335 |
+
{'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06, 'epoch': 0.66}
|
| 17336 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17337 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17338 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17339 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17340 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.11436709165573121, 'train/info_loss': 0.26202645897865295, 'train/ref_loss': None, 'train/uncertainty_loss': -9.564846986904741e-05, 'train/video_loss': 0.26193082332611084, 'train/total_loss': 0.3762979209423065}
|
| 17341 |
+
tensor(-0.0015, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17342 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17343 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17344 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17345 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.15787471532821656, 'train/info_loss': 0.17485037446022034, 'train/ref_loss': None, 'train/uncertainty_loss': -9.450375218875707e-05, 'train/video_loss': 0.17475587129592896, 'train/total_loss': 0.33263057470321655}
|
| 17346 |
+
[Rank 1] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}[Rank 3] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}[Rank 0] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}
|
| 17347 |
+
|
| 17348 |
+
[Rank 2] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}
|
| 17349 |
+
|
| 17350 |
+
{'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06, 'epoch': 0.66}
|
| 17351 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17352 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17353 |
+
tensor(0.2767, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17354 |
+
tensor(0.0772, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17355 |
+
{'train/tv_loss': 0.00016861287876963617, 'train/lm_loss': 4.701061698142439e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.2696513533592224, 'train/uncertainty_loss': 0.007722488045692444, 'train/video_loss': 0.2787451148033142, 'train/total_loss': 0.2787921130657196}
|
| 17356 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17357 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.20862238407135011, 'train/info_loss': 0.1744736284017563, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001176536548882723, 'train/video_loss': 0.17435596883296967, 'train/total_loss': 0.3829783499240875}
|
| 17358 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17359 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17360 |
+
tensor(0.2500, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17361 |
+
[Rank 3] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}[Rank 1] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}
|
| 17362 |
+
[Rank 0] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}
|
| 17363 |
+
|
| 17364 |
+
[Rank 2] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}
|
| 17365 |
+
{'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06, 'epoch': 0.66}
|
| 17366 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17367 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17368 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17369 |
+
tensor(0.0352, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17370 |
+
{'train/tv_loss': 0.00026960894465446474, 'train/lm_loss': 2.4698639754205944e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.24252426624298096, 'train/uncertainty_loss': 0.0035187847912311557, 'train/video_loss': 0.2482176274061203, 'train/total_loss': 0.2482423186302185}
|
| 17371 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17372 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17373 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17374 |
+
{'train/tv_loss': 0.00018143276683986188, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.08057326078414917, 'train/uncertainty_loss': -7.370269158855081e-05, 'train/video_loss': 0.08197075128555298, 'train/total_loss': 0.08200672268867493}
|
| 17375 |
+
tensor(0.3041, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17376 |
+
[Rank 2] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}[Rank 1] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}
|
| 17377 |
+
|
| 17378 |
+
[Rank 3] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}[Rank 0] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}
|
| 17379 |
+
|
| 17380 |
+
{'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06, 'epoch': 0.66}
|
| 17381 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17382 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17383 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17384 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17385 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.06672886610031128, 'train/info_loss': 0.18479126691818237, 'train/ref_loss': None, 'train/uncertainty_loss': -9.260554797947408e-05, 'train/video_loss': 0.1846986562013626, 'train/total_loss': 0.2514275312423706}
|
| 17386 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17387 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17388 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17389 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17390 |
+
{'train/tv_loss': 0.00025838704314082864, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.09182281792163849, 'train/uncertainty_loss': -6.915096309967339e-05, 'train/video_loss': 0.09383846819400787, 'train/total_loss': 0.09386636316776276}
|
| 17391 |
+
[Rank 1] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}[Rank 0] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}[Rank 3] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}
|
| 17392 |
+
|
| 17393 |
+
[Rank 2] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}
|
| 17394 |
+
|
| 17395 |
+
{'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06, 'epoch': 0.66}
|
| 17396 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17397 |
+
tensor(1.0766, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17398 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17399 |
+
tensor(0.7360, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17400 |
+
{'train/tv_loss': 0.00015928944339975716, 'train/lm_loss': 4.684376472141594e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.7353293299674988, 'train/uncertainty_loss': 0.07359982132911683, 'train/video_loss': 0.8102254271507263, 'train/total_loss': 0.8102722764015198}
|
| 17401 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17402 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17403 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17404 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2558208465576172, 'train/info_loss': 0.18679678440093994, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010758317075669766, 'train/video_loss': 0.18668919801712036, 'train/total_loss': 0.44251003861427307}
|
| 17405 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17406 |
+
[Rank 3] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}
|
| 17407 |
+
[Rank 2] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}
|
| 17408 |
+
[Rank 0] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}[Rank 1] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}
|
| 17409 |
+
|
| 17410 |
+
{'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06, 'epoch': 0.66}
|
| 17411 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 17412 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 17413 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 17414 |
+
tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 17415 |
+
{'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}
|
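Note: the appended train.log lines mix several record types: per-rank '[Rank N] Trainer log:' dicts (one copy per GPU), the aggregated {'loss': ..., 'epoch': ...} records, per-step 'train/...' loss breakdowns, and raw tensor(...) debug prints. Below is a minimal sketch (not part of this repository) for pulling just the aggregated loss/learning-rate records out of train.log for plotting; the regex-based filtering and the use of the 'epoch' key as the discriminator are assumptions based on the lines shown above.

import ast
import re

def parse_train_log(path="train.log"):
    """Yield the aggregated trainer records, i.e. lines like
    {'loss': 0.4444, 'grad_norm': ..., 'learning_rate': ..., 'epoch': 0.65}."""
    dict_re = re.compile(r"\{.*\}")
    with open(path) as f:
        for line in f:
            m = dict_re.search(line)
            if not m:
                continue  # tensor(...) prints and blank lines carry no dict
            try:
                record = ast.literal_eval(m.group(0))
            except (ValueError, SyntaxError):
                continue  # e.g. several per-rank dicts fused onto one line
            # Assumption: only the aggregated record carries an 'epoch' key.
            if isinstance(record, dict) and "epoch" in record:
                yield record

if __name__ == "__main__":
    for rec in parse_train_log():
        print(rec["epoch"], rec["loss"], rec["learning_rate"])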