Training in progress, step 2075
Browse files- adapter_model.safetensors +1 -1
- train.log +360 -0
    	
        adapter_model.safetensors
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
             
            size 1204780872
         | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:b24c739c235e4e9845a17684af3e9ae65224f6f9ce24eac3682df1571e15c4f5
         | 
| 3 | 
             
            size 1204780872
         | 
    	
        train.log
    CHANGED
    
    | @@ -17413,3 +17413,363 @@ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device= | |
| 17413 | 
             
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17414 | 
             
            tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17415 | 
             
            {'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 17413 | 
             
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17414 | 
             
            tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17415 | 
             
            {'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}
         | 
| 17416 | 
            +
            tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17417 | 
            +
            tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17418 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17419 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.36234724521636963, 'train/info_loss': 0.14337365329265594, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011461186222732067, 'train/video_loss': 0.14325904846191406, 'train/total_loss': 0.5056062936782837}
         | 
| 17420 | 
            +
            tensor(0.0681, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17421 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 0] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 2] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}
         | 
| 17422 | 
            +
             | 
| 17423 | 
            +
             | 
| 17424 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}
         | 
| 17425 | 
            +
            {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06, 'epoch': 0.66}
         | 
| 17426 | 
            +
            tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17427 | 
            +
            tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17428 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17429 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.06900591254234315, 'train/info_loss': 0.15682968497276306, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010683162836357952, 'train/video_loss': 0.15672285854816437, 'train/total_loss': 0.22572878003120422}
         | 
| 17430 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17431 | 
            +
            tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17432 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.40285620689392093, 'train/info_loss': 0.18601451814174652, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011755496961995959, 'train/video_loss': 0.18589696288108826, 'train/total_loss': 0.5887531638145447}
         | 
| 17433 | 
            +
            tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17434 | 
            +
            tensor(0.0370, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17435 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17436 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
         | 
| 17437 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}[Rank 3] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
         | 
| 17438 | 
            +
             | 
| 17439 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
         | 
| 17440 | 
            +
            {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06, 'epoch': 0.66}
         | 
| 17441 | 
            +
            tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17442 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17443 | 
            +
            tensor(0.2234, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17444 | 
            +
            {'train/tv_loss': 0.00023927739821374419, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.3630368113517761, 'train/uncertainty_loss': 0.022341114282608033, 'train/video_loss': 0.3873103857040405, 'train/total_loss': 0.3873384892940521}
         | 
| 17445 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17446 | 
            +
            tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17447 | 
            +
            tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17448 | 
            +
            tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17449 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.14715486764907837, 'train/info_loss': 0.13828474283218384, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031541614793241, 'train/video_loss': 0.13818158209323883, 'train/total_loss': 0.285336434841156}
         | 
| 17450 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17451 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}[Rank 1] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
         | 
| 17452 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
         | 
| 17453 | 
            +
             | 
| 17454 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
         | 
| 17455 | 
            +
            {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06, 'epoch': 0.66}
         | 
| 17456 | 
            +
            tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17457 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.4195257663726807, 'train/info_loss': 0.10877030342817307, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011566100874915719, 'train/video_loss': 0.10865464061498642, 'train/total_loss': 0.5281804203987122}
         | 
| 17458 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17459 | 
            +
            tensor(0.1955, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17460 | 
            +
            tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17461 | 
            +
            tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17462 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 2.8298282995820048e-05, 'train/info_loss': 2.342404332011938e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015964442864060404, 'train/video_loss': -0.00013622039114125073, 'train/total_loss': -0.00010792211105581373}
         | 
| 17463 | 
            +
            tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17464 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17465 | 
            +
            tensor(0.0522, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17466 | 
            +
            [Rank 3] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
         | 
| 17467 | 
            +
            [Rank 0] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}[Rank 1] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
         | 
| 17468 | 
            +
             | 
| 17469 | 
            +
            [Rank 2] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
         | 
| 17470 | 
            +
            {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06, 'epoch': 0.66}
         | 
| 17471 | 
            +
            tensor(-0.0016, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17472 | 
            +
            tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17473 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17474 | 
            +
            tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17475 | 
            +
            {'train/tv_loss': 0.00031795650720596315, 'train/lm_loss': 7.840050384402276e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.1724846065044403, 'train/uncertainty_loss': -6.841651047579944e-05, 'train/video_loss': 0.17498399317264557, 'train/total_loss': 0.1750623881816864}
         | 
| 17476 | 
            +
            tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17477 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.19196442365646363, 'train/info_loss': 0.2579406201839447, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011937021045014263, 'train/video_loss': 0.257821261882782, 'train/total_loss': 0.4497857093811035}
         | 
| 17478 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17479 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17480 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17481 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
         | 
| 17482 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
         | 
| 17483 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}[Rank 1] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
         | 
| 17484 | 
            +
             | 
| 17485 | 
            +
            {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06, 'epoch': 0.66}
         | 
| 17486 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17487 | 
            +
            tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17488 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.3795618772506714, 'train/info_loss': 0.29438889026641846, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011655737180262805, 'train/video_loss': 0.2942723333835602, 'train/total_loss': 0.6738342046737671}
         | 
| 17489 | 
            +
            tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17490 | 
            +
            tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17491 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17492 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.32472651004791264, 'train/info_loss': 0.22495689988136292, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001121837878599763, 'train/video_loss': 0.22484470903873444, 'train/total_loss': 0.5495712161064148}
         | 
| 17493 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17494 | 
            +
            tensor(0.0239, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17495 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17496 | 
            +
            [Rank 2] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}[Rank 3] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
         | 
| 17497 | 
            +
             | 
| 17498 | 
            +
            [Rank 0] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
         | 
| 17499 | 
            +
            [Rank 1] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
         | 
| 17500 | 
            +
            {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06, 'epoch': 0.66}
         | 
| 17501 | 
            +
            tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17502 | 
            +
            tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17503 | 
            +
            tensor(0.4046, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17504 | 
            +
            {'train/tv_loss': 0.00038167270831763745, 'train/lm_loss': 2.8584344545379284e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.49088212847709656, 'train/uncertainty_loss': 0.04045624732971192, 'train/video_loss': 0.5344108939170837, 'train/total_loss': 0.5344395041465759}
         | 
| 17505 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17506 | 
            +
            tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17507 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17508 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.24148297309875488, 'train/info_loss': 0.2007106989622116, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011322950012981892, 'train/video_loss': 0.20059746503829956, 'train/total_loss': 0.44208043813705444}
         | 
| 17509 | 
            +
            tensor(0.0585, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17510 | 
            +
            tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17511 | 
            +
            [Rank 0] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 2] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 1] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}
         | 
| 17512 | 
            +
             | 
| 17513 | 
            +
             | 
| 17514 | 
            +
            [Rank 3] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}
         | 
| 17515 | 
            +
            {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06, 'epoch': 0.66}
         | 
| 17516 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17517 | 
            +
            tensor(0.2698, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17518 | 
            +
            tensor(0.4741, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17519 | 
            +
            {'train/tv_loss': 0.00020899735391139985, 'train/lm_loss': 2.796454355120659e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.5671656727790833, 'train/uncertainty_loss': 0.04740565419197083, 'train/video_loss': 0.6162612438201904, 'train/total_loss': 0.6162891983985901}
         | 
| 17520 | 
            +
            tensor(0.2692, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17521 | 
            +
            tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17522 | 
            +
            tensor(0.2025, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17523 | 
            +
            {'train/tv_loss': 0.00020984613802284003, 'train/lm_loss': 4.1623553261160855e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.34875354170799255, 'train/uncertainty_loss': 0.020245166122913362, 'train/video_loss': 0.3707001805305481, 'train/total_loss': 0.3707418143749237}
         | 
| 17524 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17525 | 
            +
            tensor(0.1522, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17526 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
         | 
| 17527 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
         | 
| 17528 | 
            +
             | 
| 17529 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
         | 
| 17530 | 
            +
            {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06, 'epoch': 0.66}
         | 
| 17531 | 
            +
            tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17532 | 
            +
            tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17533 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.3018145322799683, 'train/info_loss': 0.25296148657798767, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014256734866648912, 'train/video_loss': 0.25281891226768494, 'train/total_loss': 0.5546334385871887}
         | 
| 17534 | 
            +
            tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17535 | 
            +
            tensor(0.7656, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17536 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17537 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.39072275161743164, 'train/info_loss': 0.18714040517807007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010978373466059567, 'train/video_loss': 0.187030628323555, 'train/total_loss': 0.5777533650398254}
         | 
| 17538 | 
            +
            tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17539 | 
            +
            tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17540 | 
            +
            tensor(0.0862, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17541 | 
            +
            [Rank 1] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 0] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 3] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}
         | 
| 17542 | 
            +
             | 
| 17543 | 
            +
            [Rank 2] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}
         | 
| 17544 | 
            +
             | 
| 17545 | 
            +
            {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06, 'epoch': 0.66}
         | 
| 17546 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17547 | 
            +
            tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17548 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.3718518972396851, 'train/info_loss': 0.2331818789243698, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013318745186552405, 'train/video_loss': 0.2330486923456192, 'train/total_loss': 0.604900598526001}
         | 
| 17549 | 
            +
            tensor(0.2073, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17550 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17551 | 
            +
            tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17552 | 
            +
            tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17553 | 
            +
            tensor(0.1576, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17554 | 
            +
            tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17555 | 
            +
            {'train/tv_loss': 0.00022317415568977596, 'train/lm_loss': 2.8441313770599666e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.14411097764968872, 'train/uncertainty_loss': -6.683562532998622e-05, 'train/video_loss': 0.1458483785390854, 'train/total_loss': 0.14587682485580444}
         | 
| 17556 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}[Rank 0] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
         | 
| 17557 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
         | 
| 17558 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
         | 
| 17559 | 
            +
             | 
| 17560 | 
            +
            {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06, 'epoch': 0.67}
         | 
| 17561 | 
            +
            tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17562 | 
            +
            tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17563 | 
            +
            tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17564 | 
            +
            tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17565 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.062081152200698854, 'train/info_loss': 0.16616980731487274, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001029652892611921, 'train/video_loss': 0.16606684029102325, 'train/total_loss': 0.22814799845218658}
         | 
| 17566 | 
            +
            tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17567 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17568 | 
            +
            tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17569 | 
            +
            {'train/tv_loss': 0.00018424011068418622, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.03961974009871483, 'train/uncertainty_loss': -6.689630681648851e-05, 'train/video_loss': 0.04104745015501976, 'train/total_loss': 0.04108402132987976}
         | 
| 17570 | 
            +
            tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17571 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
         | 
| 17572 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
         | 
| 17573 | 
            +
             | 
| 17574 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
         | 
| 17575 | 
            +
            {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06, 'epoch': 0.67}
         | 
| 17576 | 
            +
            tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17577 | 
            +
            tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17578 | 
            +
            tensor(0.0723, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17579 | 
            +
            tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17580 | 
            +
            {'train/tv_loss': 0.0002113046357408166, 'train/lm_loss': 4.093228199053556e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.0825377106666565, 'train/uncertainty_loss': -6.546297227032483e-05, 'train/video_loss': 0.08418301492929459, 'train/total_loss': 0.08422394841909409}
         | 
| 17581 | 
            +
            tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17582 | 
            +
            tensor(0.1926, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17583 | 
            +
            tensor(0.1136, device='cuda:3', grad_fn=<AddBackward0>) tensor(0.0902, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17584 | 
            +
             tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17585 | 
            +
            {'train/tv_loss': 0.0002980440389364958, 'train/lm_loss': 3.676076594274491e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.27036359906196594, 'train/uncertainty_loss': 0.00902046412229538, 'train/video_loss': 0.2817894220352173, 'train/total_loss': 0.2818261682987213}
         | 
| 17586 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
         | 
| 17587 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
         | 
| 17588 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
         | 
| 17589 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
         | 
| 17590 | 
            +
            {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06, 'epoch': 0.67}
         | 
| 17591 | 
            +
            tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17592 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.22054891586303713, 'train/info_loss': 0.16172195971012115, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012107143411412836, 'train/video_loss': 0.16160088777542114, 'train/total_loss': 0.3821498155593872}
         | 
| 17593 | 
            +
            tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17594 | 
            +
            tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17595 | 
            +
            tensor(0.0148, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17596 | 
            +
            tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17597 | 
            +
            tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17598 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.2663368940353394, 'train/info_loss': 0.10841624438762665, 'train/ref_loss': None, 'train/uncertainty_loss': -8.942196145653725e-05, 'train/video_loss': 0.10832682251930237, 'train/total_loss': 0.37466371059417725}
         | 
| 17599 | 
            +
            tensor(0.1076, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17600 | 
            +
            tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17601 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 2] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 3] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}
         | 
| 17602 | 
            +
             | 
| 17603 | 
            +
             | 
| 17604 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}
         | 
| 17605 | 
            +
            {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06, 'epoch': 0.67}
         | 
| 17606 | 
            +
            tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17607 | 
            +
            tensor(0.2763, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17608 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17609 | 
            +
            tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17610 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.06459110975265503, 'train/info_loss': 0.21699465811252594, 'train/ref_loss': None, 'train/uncertainty_loss': -9.331009350717069e-05, 'train/video_loss': 0.21690134704113007, 'train/total_loss': 0.2814924716949463}
         | 
| 17611 | 
            +
            tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17612 | 
            +
            tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17613 | 
            +
            tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17614 | 
            +
            tensor(0.0351, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17615 | 
            +
            {'train/tv_loss': 0.00018034547101706267, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.22903497517108917, 'train/uncertainty_loss': 0.003506988659501076, 'train/video_loss': 0.2340044528245926, 'train/total_loss': 0.23403652012348175}
         | 
| 17616 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
         | 
| 17617 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}[Rank 2] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
         | 
| 17618 | 
            +
             | 
| 17619 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
         | 
| 17620 | 
            +
            {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06, 'epoch': 0.67}
         | 
| 17621 | 
            +
            tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17622 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.07050921320915222, 'train/info_loss': 0.19436703622341156, 'train/ref_loss': None, 'train/uncertainty_loss': -8.927494054660201e-05, 'train/video_loss': 0.19427776336669922, 'train/total_loss': 0.2647869884967804}
         | 
| 17623 | 
            +
            tensor(0.1349, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17624 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17625 | 
            +
            tensor(0.0307, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17626 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17627 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17628 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17629 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.15221678018569948, 'train/info_loss': 0.10548965632915497, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011310963891446591, 'train/video_loss': 0.10537654906511307, 'train/total_loss': 0.2575933337211609}
         | 
| 17630 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17631 | 
            +
            [Rank 1] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 0] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 3] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}
         | 
| 17632 | 
            +
             | 
| 17633 | 
            +
            [Rank 2] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}
         | 
| 17634 | 
            +
             | 
| 17635 | 
            +
            {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06, 'epoch': 0.67}
         | 
| 17636 | 
            +
            tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17637 | 
            +
            tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17638 | 
            +
            tensor(0.1315, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17639 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17640 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.17559853792190552, 'train/info_loss': 0.09834705293178558, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001089532976038754, 'train/video_loss': 0.09823810309171677, 'train/total_loss': 0.2738366425037384}
         | 
| 17641 | 
            +
            tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17642 | 
            +
            tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17643 | 
            +
            tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17644 | 
            +
            tensor(0.3571, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17645 | 
            +
            {'train/tv_loss': 0.00020730055402964356, 'train/lm_loss': 3.654622996691615e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.46529483795166016, 'train/uncertainty_loss': 0.035708272457122804, 'train/video_loss': 0.5026838779449463, 'train/total_loss': 0.5027204155921936}
         | 
| 17646 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}[Rank 0] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
         | 
| 17647 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
         | 
| 17648 | 
            +
             | 
| 17649 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
         | 
| 17650 | 
            +
            {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06, 'epoch': 0.67}
         | 
| 17651 | 
            +
            tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17652 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.1469507694244385, 'train/info_loss': 0.2718953788280487, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012867078185081483, 'train/video_loss': 0.271766722202301, 'train/total_loss': 0.41871750354766846}
         | 
| 17653 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17654 | 
            +
            tensor(0.5532, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17655 | 
            +
            tensor(0.4076, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17656 | 
            +
            tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17657 | 
            +
            tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17658 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.08282127976417542, 'train/info_loss': 0.19566015899181366, 'train/ref_loss': None, 'train/uncertainty_loss': -8.814950124360622e-05, 'train/video_loss': 0.19557200372219086, 'train/total_loss': 0.2783932685852051}
         | 
| 17659 | 
            +
            tensor(0.6230, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17660 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17661 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
         | 
| 17662 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
         | 
| 17663 | 
            +
            [Rank 3] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
         | 
| 17664 | 
            +
            [Rank 0] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
         | 
| 17665 | 
            +
            {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06, 'epoch': 0.67}
         | 
| 17666 | 
            +
            tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17667 | 
            +
            tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17668 | 
            +
            tensor(0.4353, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17669 | 
            +
            {'train/tv_loss': 0.00014477868098765613, 'train/lm_loss': 2.8036060393787923e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.5355187058448792, 'train/uncertainty_loss': 0.043532878160476685, 'train/video_loss': 0.5802289247512817, 'train/total_loss': 0.5802569389343262}
         | 
| 17670 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17671 | 
            +
            tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17672 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.2846735239028931, 'train/info_loss': 0.25035104155540466, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012556229485198857, 'train/video_loss': 0.2502254843711853, 'train/total_loss': 0.5348989963531494}
         | 
| 17673 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17674 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17675 | 
            +
            tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17676 | 
            +
            [Rank 1] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
         | 
| 17677 | 
            +
            [Rank 0] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}[Rank 2] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
         | 
| 17678 | 
            +
            [Rank 3] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
         | 
| 17679 | 
            +
             | 
| 17680 | 
            +
            {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06, 'epoch': 0.67}
         | 
| 17681 | 
            +
            tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17682 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.35250082015991213, 'train/info_loss': 0.12986838817596436, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011850049486383796, 'train/video_loss': 0.12974989414215088, 'train/total_loss': 0.48225072026252747}
         | 
| 17683 | 
            +
            tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17684 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17685 | 
            +
            tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17686 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17687 | 
            +
            tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17688 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.06482524275779725, 'train/info_loss': 0.19261880218982697, 'train/ref_loss': None, 'train/uncertainty_loss': -9.101710165850819e-05, 'train/video_loss': 0.19252778589725494, 'train/total_loss': 0.2573530375957489}
         | 
| 17689 | 
            +
            tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17690 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17691 | 
            +
            [Rank 3] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}[Rank 0] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
         | 
| 17692 | 
            +
            [Rank 1] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
         | 
| 17693 | 
            +
            [Rank 2] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
         | 
| 17694 | 
            +
             | 
| 17695 | 
            +
            {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06, 'epoch': 0.67}
         | 
| 17696 | 
            +
            tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17697 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17698 | 
            +
            tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17699 | 
            +
            tensor(0.0826, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17700 | 
            +
            {'train/tv_loss': 0.00014374495949596167, 'train/lm_loss': 2.815525222104043e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2506270706653595, 'train/uncertainty_loss': 0.008263303339481354, 'train/video_loss': 0.2600594460964203, 'train/total_loss': 0.26008760929107666}
         | 
| 17701 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17702 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.32692942619323734, 'train/info_loss': 0.17380265891551971, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010667567839846016, 'train/video_loss': 0.17369598150253296, 'train/total_loss': 0.5006253719329834}
         | 
| 17703 | 
            +
            tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17704 | 
            +
            tensor(0.4338, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17705 | 
            +
            tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17706 | 
            +
            [Rank 3] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
         | 
| 17707 | 
            +
            [Rank 2] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}[Rank 1] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
         | 
| 17708 | 
            +
            [Rank 0] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
         | 
| 17709 | 
            +
             | 
| 17710 | 
            +
            {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06, 'epoch': 0.67}
         | 
| 17711 | 
            +
            tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17712 | 
            +
            tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17713 | 
            +
            tensor(0.0401, device='cuda:0', grad_fn=<AddBackward0>) tensor(0.1457, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17714 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17715 | 
            +
            {'train/tv_loss': 0.0001431380049325526, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.21703669428825378, 'train/uncertainty_loss': 0.0040077798068523405, 'train/video_loss': 0.22220520675182343, 'train/total_loss': 0.22222697734832764}
         | 
| 17716 | 
            +
            tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17717 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17718 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.19441068172454834, 'train/info_loss': 0.2764996588230133, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010609085438773036, 'train/video_loss': 0.2763935625553131, 'train/total_loss': 0.47080424427986145}
         | 
| 17719 | 
            +
            tensor(0.0335, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17720 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17721 | 
            +
            [Rank 1] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 3] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 0] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}
         | 
| 17722 | 
            +
             | 
| 17723 | 
            +
            [Rank 2] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}
         | 
| 17724 | 
            +
             | 
| 17725 | 
            +
            {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06, 'epoch': 0.67}
         | 
| 17726 | 
            +
            tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17727 | 
            +
            tensor(0.1813, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17728 | 
            +
            tensor(0.1105, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17729 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17730 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.10859153270721436, 'train/info_loss': 0.21142883598804474, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010824981145560742, 'train/video_loss': 0.21132057905197144, 'train/total_loss': 0.3199121057987213}
         | 
| 17731 | 
            +
            tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17732 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17733 | 
            +
            tensor(0.2025, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17734 | 
            +
            {'train/tv_loss': 0.00026534441858530047, 'train/lm_loss': 3.2517651561647654e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.34838536381721497, 'train/uncertainty_loss': 0.020245753228664398, 'train/video_loss': 0.3707745671272278, 'train/total_loss': 0.37080708146095276}
         | 
| 17735 | 
            +
            tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17736 | 
            +
            [Rank 3] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 1] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 2] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}
         | 
| 17737 | 
            +
             | 
| 17738 | 
            +
             | 
| 17739 | 
            +
            [Rank 0] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}
         | 
| 17740 | 
            +
            {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06, 'epoch': 0.67}
         | 
| 17741 | 
            +
            tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17742 | 
            +
            tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17743 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.4450415134429932, 'train/info_loss': 0.18519927561283112, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010915255406871439, 'train/video_loss': 0.18509012460708618, 'train/total_loss': 0.6301316022872925}
         | 
| 17744 | 
            +
            tensor(0.2665, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17745 | 
            +
            tensor(0.2499, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17746 | 
            +
            tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17747 | 
            +
            {'train/tv_loss': None, 'train/lm_loss': 0.31789367198944096, 'train/info_loss': 0.2461404800415039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014492600457742812, 'train/video_loss': 0.24599555134773254, 'train/total_loss': 0.5638892650604248}
         | 
| 17748 | 
            +
            tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17749 | 
            +
            tensor(0.4158, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17750 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17751 | 
            +
            [Rank 1] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
         | 
| 17752 | 
            +
            [Rank 3] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
         | 
| 17753 | 
            +
            [Rank 0] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
         | 
| 17754 | 
            +
            [Rank 2] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
         | 
| 17755 | 
            +
            {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06, 'epoch': 0.67}
         | 
| 17756 | 
            +
            tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17757 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17758 | 
            +
            tensor(0.0608, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17759 | 
            +
            tensor(0.1152, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17760 | 
            +
            {'train/tv_loss': 0.000334132113493979, 'train/lm_loss': 3.177867038175464e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2817174196243286, 'train/uncertainty_loss': 0.011518295109272004, 'train/video_loss': 0.2959294617176056, 'train/total_loss': 0.2959612309932709}
         | 
| 17761 | 
            +
            tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17762 | 
            +
            tensor(0.3057, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17763 | 
            +
            {'train/tv_loss': 0.0002599612809717655, 'train/lm_loss': 4.741583543363959e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.42135030031204224, 'train/uncertainty_loss': 0.03057071566581726, 'train/video_loss': 0.4540237784385681, 'train/total_loss': 0.45407119393348694}
         | 
| 17764 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         | 
| 17765 | 
            +
            tensor(0.0280, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17766 | 
            +
            [Rank 1] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 2] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}
         | 
| 17767 | 
            +
             | 
| 17768 | 
            +
            [Rank 0] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 3] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}
         | 
| 17769 | 
            +
             | 
| 17770 | 
            +
            {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06, 'epoch': 0.67}
         | 
| 17771 | 
            +
            tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
         | 
| 17772 | 
            +
            tensor(0.0697, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
         | 
| 17773 | 
            +
            {'train/tv_loss': 0.0003155144164338708, 'train/lm_loss': 5.3374795243144035e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.22853229939937592, 'train/uncertainty_loss': 0.006969699263572693, 'train/video_loss': 0.2380506843328476, 'train/total_loss': 0.23810406029224396}
         | 
| 17774 | 
            +
            tensor(0.2178, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
         | 
| 17775 | 
            +
            tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
         |