diff --git "a/train.log" "b/train.log" --- "a/train.log" +++ "b/train.log" @@ -1,19 +1,19 @@ -[2025-04-22 16:06:22,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2025-04-22 16:06:22,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2025-04-22 16:06:22,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2025-04-22 16:06:22,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -Global rank 1, Local Rank: 1 initiatedGlobal rank 3, Local Rank: 3 initiatedGlobal rank 0, Local Rank: 0 initiated +[2025-04-23 18:06:31,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-23 18:06:31,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-23 18:06:31,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-23 18:06:31,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Global rank 0, Local Rank: 0 initiated +Global rank 1, Local Rank: 1 initiatedGlobal rank 3, Local Rank: 3 initiated Global rank 2, Local Rank: 2 initiated - -[2025-04-22 16:06:25,876] [INFO] [comm.py:652:init_distributed] cdb=None -[2025-04-22 16:06:25,876] [INFO] [comm.py:652:init_distributed] cdb=None -[2025-04-22 16:06:25,876] [INFO] [comm.py:652:init_distributed] cdb=None -[2025-04-22 16:06:25,876] [INFO] [comm.py:652:init_distributed] cdb=None -[2025-04-22 16:06:25,876] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-23 18:06:35,705] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-23 18:06:35,705] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-23 18:06:35,705] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-23 18:06:35,705] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-04-23 18:06:35,705] [INFO] [comm.py:652:init_distributed] cdb=None GPU 0 - Using device: cuda -GPU 3 - Using device: cuda GPU 2 - Using device: cuda +GPU 3 - Using device: cuda GPU 1 - Using device: cuda Wandb initialized Rank 0: Loading vision tower: google/siglip-so400m-patch14-384 @@ -23,10 +23,6 @@ creating lora with config: LoraConfig(peft_type=, auto_ma creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None) trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 -[Rank 3] Distributed initialized? True -[Rank 3] Backend: nccl -load datasets/coin/videos_metadata.json... -trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 [Rank 2] Distributed initialized? True [Rank 2] Backend: nccl load datasets/coin/videos_metadata.json... @@ -472,6 +468,14 @@ trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.4 ('base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight', torch.Size([18944, 16]), torch.float32, True) ('base_model.model.model.layers.16.mlp.down_proj.weight', torch.Size([3584, 18944]), torch.bfloat16, False) ('base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight', torch.Size([16, 18944]), torch.float32, True) +trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 +[Rank 1] Distributed initialized? True +[Rank 1] Backend: nccl +load datasets/coin/videos_metadata.json... +trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 +[Rank 3] Distributed initialized? True +[Rank 3] Backend: nccl +load datasets/coin/videos_metadata.json... ('base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight', torch.Size([3584, 16]), torch.float32, True) ('base_model.model.model.layers.16.input_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.layers.16.post_attention_layernorm.weight', torch.Size([3584]), torch.bfloat16, False) @@ -1202,16574 +1206,458 @@ trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.4 [Rank 0] Distributed initialized? True [Rank 0] Backend: nccl load datasets/coin/videos_metadata.json... -trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 -[Rank 1] Distributed initialized? True -[Rank 1] Backend: nccl -load datasets/coin/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nSimply interpret the scene for me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30833, 30884), range(31623, 31656)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] -load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nSimply interpret the scene for me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30833, 30884), range(31623, 31656)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30830, 30881), range(31620, 31653)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nContinuously answer what you observed with simple text.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30854, 30905), range(31644, 31677)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset MAGQAStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat can you tell me about? Be concise.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nContinuously answer what you observed with simple text.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30854, 30905), range(31644, 31677)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 4506 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat is the action now? Please response in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30848, 30899), range(31638, 31671)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... Dataset MAGQAStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] -load datasets/shot2story/release_134k_videos_metadata.json... Dataset MAGQAStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] -load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat is the action now? Please response in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1693, 1800), range(5283, 5527), range(9010, 9352), range(13864, 14265)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] Dataset MAGQAStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] -load datasets/hisum/videos_metadata.json... +Dataset MAGQAStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +load datasets/shot2story/release_134k_videos_metadata.json... load datasets/shot2story/release_134k_videos_metadata.json... +load datasets/shot2story/release_134k_videos_metadata.json... +load datasets/shot2story/release_134k_videos_metadata.json... +Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nUse simple text to explain what is shown in front of me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1700, 1807), range(5290, 5534), range(9017, 9359), range(13871, 14272)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nUse simple text to explain what is shown in front of me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1700, 1807), range(5290, 5534), range(9017, 9359), range(13871, 14272)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease concisely narrate the video in real time.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1692, 1799), range(5282, 5526), range(9009, 9351), range(13863, 14264)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat can you tell me about? Be concise.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease concisely narrate the video in real time.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1692, 1799), range(5282, 5526), range(9009, 9351), range(13863, 14264)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 5000 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1675, 1782), range(5265, 5509), range(8992, 9334), range(13846, 14247)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +load datasets/hisum/videos_metadata.json... load datasets/hisum/videos_metadata.json... Mr. HiSum loaded 10381 out of 12000 videos Mr. HiSum loaded 10381 out of 12000 videos Mr. HiSum loaded 10381 out of 12000 videos Mr. HiSum loaded 10381 out of 12000 videos Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Secret Story 3: Apresentação videoclip Casa dos Segredos 3'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.573610125431077, 0.573610125431077, 0.573610125431077, 0.5190257218487939, 0.5190257218487939, 0.4920738181080592, 0.4920738181080592, 0.4601356210236246, 0.4601356210236246, 0.43507277660799903, 0.43507277660799903, 0.38699690685350124, 0.38699690685350124, 0.38699690685350124, 0.37920869077893027, 0.37920869077893027, 0.3731251628395779, 0.3731251628395779, 0.3757135246799159, 0.3757135246799159, 0.4061223641868629, 0.4061223641868629, 0.412723468317345, 0.412723468317345, 0.4480569998392883, 0.4480569998392883, 0.4480569998392883, 0.49385419749337894, 0.49385419749337894, 0.5368615907107697, 0.5368615907107697, 0.6325263863833461, 0.6325263863833461, 0.7174177300069065, 0.7174177300069065, 0.7305644441747216, 0.7305644441747216, 0.6170838325317146, 0.6170838325317146, 0.6170838325317146, 0.40516891411332123, 0.40516891411332123, 0.2798772936155921, 0.2798772936155921, 0.2258411947560871, 0.2258411947560871, 0.21026533966857236, 0.21026533966857236, 0.19389804455935236, 0.19389804455935236, 0.18344125507826253, 0.18344125507826253, 0.18344125507826253, 0.19896618447717687, 0.19896618447717687, 0.2210711552571805, 0.2210711552571805, 0.2350028175033948, 0.2350028175033948, 0.2521406822388029, 0.2521406822388029, 0.26011932865035, 0.26011932865035, 0.26904334627265797, 0.26904334627265797, 0.26904334627265797, 0.2405866100735917, 0.2405866100735917, 0.1929686868087463, 0.1929686868087463, 0.1783107444162315, 0.1783107444162315, 0.16864515451450218, 0.16864515451450218, 0.15701144784473733, 0.15701144784473733, 0.12751931882106693, 0.12751931882106693, 0.12751931882106693, 0.09733938042624272, 0.09733938042624272, 0.08791658920415796, 0.08791658920415796, 0.07448338798267237, 0.07448338798267237, 0.0767613147118111, 0.0767613147118111, 0.07152613949714658, 0.07152613949714658, 0.07218002245346791, 0.07218002245346791, 0.07218002245346791, 0.0727822103056859, 0.0727822103056859, 0.06409070043067071, 0.06409070043067071, 0.05969236075238449, 0.05969236075238449, 0.06923746499520125, 0.06923746499520125, 0.06907980694980323, 0.06907980694980323, 0.05680241207913092, 0.05680241207913092, 0.05680241207913092, 0.062098082587713606, 0.062098082587713606, 0.08217037783013864, 0.08217037783013864, 0.07396535495108778, 0.07396535495108778, 0.07110803430400549, 0.07110803430400549, 0.07003520460633673, 0.07003520460633673, 0.07003520460633673, 0.05085658547056923, 0.05085658547056923, 0.03540607297732927, 0.03540607297732927, 0.027072726018929738, 0.027072726018929738, 0.023274049548242495, 0.023274049548242495, 0.02204286452237619, 0.02204286452237619, 0.018730699091887733, 0.018730699091887733], 0] -Starting Training! -Resuming from checkpoint: outputs/aha/checkpoint-975 Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Secret Story 3: Apresentação videoclip Casa dos Segredos 3'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.573610125431077, 0.573610125431077, 0.573610125431077, 0.5190257218487939, 0.5190257218487939, 0.4920738181080592, 0.4920738181080592, 0.4601356210236246, 0.4601356210236246, 0.43507277660799903, 0.43507277660799903, 0.38699690685350124, 0.38699690685350124, 0.38699690685350124, 0.37920869077893027, 0.37920869077893027, 0.3731251628395779, 0.3731251628395779, 0.3757135246799159, 0.3757135246799159, 0.4061223641868629, 0.4061223641868629, 0.412723468317345, 0.412723468317345, 0.4480569998392883, 0.4480569998392883, 0.4480569998392883, 0.49385419749337894, 0.49385419749337894, 0.5368615907107697, 0.5368615907107697, 0.6325263863833461, 0.6325263863833461, 0.7174177300069065, 0.7174177300069065, 0.7305644441747216, 0.7305644441747216, 0.6170838325317146, 0.6170838325317146, 0.6170838325317146, 0.40516891411332123, 0.40516891411332123, 0.2798772936155921, 0.2798772936155921, 0.2258411947560871, 0.2258411947560871, 0.21026533966857236, 0.21026533966857236, 0.19389804455935236, 0.19389804455935236, 0.18344125507826253, 0.18344125507826253, 0.18344125507826253, 0.19896618447717687, 0.19896618447717687, 0.2210711552571805, 0.2210711552571805, 0.2350028175033948, 0.2350028175033948, 0.2521406822388029, 0.2521406822388029, 0.26011932865035, 0.26011932865035, 0.26904334627265797, 0.26904334627265797, 0.26904334627265797, 0.2405866100735917, 0.2405866100735917, 0.1929686868087463, 0.1929686868087463, 0.1783107444162315, 0.1783107444162315, 0.16864515451450218, 0.16864515451450218, 0.15701144784473733, 0.15701144784473733, 0.12751931882106693, 0.12751931882106693, 0.12751931882106693, 0.09733938042624272, 0.09733938042624272, 0.08791658920415796, 0.08791658920415796, 0.07448338798267237, 0.07448338798267237, 0.0767613147118111, 0.0767613147118111, 0.07152613949714658, 0.07152613949714658, 0.07218002245346791, 0.07218002245346791, 0.07218002245346791, 0.0727822103056859, 0.0727822103056859, 0.06409070043067071, 0.06409070043067071, 0.05969236075238449, 0.05969236075238449, 0.06923746499520125, 0.06923746499520125, 0.06907980694980323, 0.06907980694980323, 0.05680241207913092, 0.05680241207913092, 0.05680241207913092, 0.062098082587713606, 0.062098082587713606, 0.08217037783013864, 0.08217037783013864, 0.07396535495108778, 0.07396535495108778, 0.07110803430400549, 0.07110803430400549, 0.07003520460633673, 0.07003520460633673, 0.07003520460633673, 0.05085658547056923, 0.05085658547056923, 0.03540607297732927, 0.03540607297732927, 0.027072726018929738, 0.027072726018929738, 0.023274049548242495, 0.023274049548242495, 0.02204286452237619, 0.02204286452237619, 0.018730699091887733, 0.018730699091887733], 0] -Starting Training! -Resuming from checkpoint: outputs/aha/checkpoint-975 Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Secret Story 3: Apresentação videoclip Casa dos Segredos 3'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.573610125431077, 0.573610125431077, 0.573610125431077, 0.5190257218487939, 0.5190257218487939, 0.4920738181080592, 0.4920738181080592, 0.4601356210236246, 0.4601356210236246, 0.43507277660799903, 0.43507277660799903, 0.38699690685350124, 0.38699690685350124, 0.38699690685350124, 0.37920869077893027, 0.37920869077893027, 0.3731251628395779, 0.3731251628395779, 0.3757135246799159, 0.3757135246799159, 0.4061223641868629, 0.4061223641868629, 0.412723468317345, 0.412723468317345, 0.4480569998392883, 0.4480569998392883, 0.4480569998392883, 0.49385419749337894, 0.49385419749337894, 0.5368615907107697, 0.5368615907107697, 0.6325263863833461, 0.6325263863833461, 0.7174177300069065, 0.7174177300069065, 0.7305644441747216, 0.7305644441747216, 0.6170838325317146, 0.6170838325317146, 0.6170838325317146, 0.40516891411332123, 0.40516891411332123, 0.2798772936155921, 0.2798772936155921, 0.2258411947560871, 0.2258411947560871, 0.21026533966857236, 0.21026533966857236, 0.19389804455935236, 0.19389804455935236, 0.18344125507826253, 0.18344125507826253, 0.18344125507826253, 0.19896618447717687, 0.19896618447717687, 0.2210711552571805, 0.2210711552571805, 0.2350028175033948, 0.2350028175033948, 0.2521406822388029, 0.2521406822388029, 0.26011932865035, 0.26011932865035, 0.26904334627265797, 0.26904334627265797, 0.26904334627265797, 0.2405866100735917, 0.2405866100735917, 0.1929686868087463, 0.1929686868087463, 0.1783107444162315, 0.1783107444162315, 0.16864515451450218, 0.16864515451450218, 0.15701144784473733, 0.15701144784473733, 0.12751931882106693, 0.12751931882106693, 0.12751931882106693, 0.09733938042624272, 0.09733938042624272, 0.08791658920415796, 0.08791658920415796, 0.07448338798267237, 0.07448338798267237, 0.0767613147118111, 0.0767613147118111, 0.07152613949714658, 0.07152613949714658, 0.07218002245346791, 0.07218002245346791, 0.07218002245346791, 0.0727822103056859, 0.0727822103056859, 0.06409070043067071, 0.06409070043067071, 0.05969236075238449, 0.05969236075238449, 0.06923746499520125, 0.06923746499520125, 0.06907980694980323, 0.06907980694980323, 0.05680241207913092, 0.05680241207913092, 0.05680241207913092, 0.062098082587713606, 0.062098082587713606, 0.08217037783013864, 0.08217037783013864, 0.07396535495108778, 0.07396535495108778, 0.07110803430400549, 0.07110803430400549, 0.07003520460633673, 0.07003520460633673, 0.07003520460633673, 0.05085658547056923, 0.05085658547056923, 0.03540607297732927, 0.03540607297732927, 0.027072726018929738, 0.027072726018929738, 0.023274049548242495, 0.023274049548242495, 0.02204286452237619, 0.02204286452237619, 0.018730699091887733, 0.018730699091887733], 0] Dataset HiSumDataset has 10298 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Secret Story 3: Apresentação videoclip Casa dos Segredos 3'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.573610125431077, 0.573610125431077, 0.573610125431077, 0.5190257218487939, 0.5190257218487939, 0.4920738181080592, 0.4920738181080592, 0.4601356210236246, 0.4601356210236246, 0.43507277660799903, 0.43507277660799903, 0.38699690685350124, 0.38699690685350124, 0.38699690685350124, 0.37920869077893027, 0.37920869077893027, 0.3731251628395779, 0.3731251628395779, 0.3757135246799159, 0.3757135246799159, 0.4061223641868629, 0.4061223641868629, 0.412723468317345, 0.412723468317345, 0.4480569998392883, 0.4480569998392883, 0.4480569998392883, 0.49385419749337894, 0.49385419749337894, 0.5368615907107697, 0.5368615907107697, 0.6325263863833461, 0.6325263863833461, 0.7174177300069065, 0.7174177300069065, 0.7305644441747216, 0.7305644441747216, 0.6170838325317146, 0.6170838325317146, 0.6170838325317146, 0.40516891411332123, 0.40516891411332123, 0.2798772936155921, 0.2798772936155921, 0.2258411947560871, 0.2258411947560871, 0.21026533966857236, 0.21026533966857236, 0.19389804455935236, 0.19389804455935236, 0.18344125507826253, 0.18344125507826253, 0.18344125507826253, 0.19896618447717687, 0.19896618447717687, 0.2210711552571805, 0.2210711552571805, 0.2350028175033948, 0.2350028175033948, 0.2521406822388029, 0.2521406822388029, 0.26011932865035, 0.26011932865035, 0.26904334627265797, 0.26904334627265797, 0.26904334627265797, 0.2405866100735917, 0.2405866100735917, 0.1929686868087463, 0.1929686868087463, 0.1783107444162315, 0.1783107444162315, 0.16864515451450218, 0.16864515451450218, 0.15701144784473733, 0.15701144784473733, 0.12751931882106693, 0.12751931882106693, 0.12751931882106693, 0.09733938042624272, 0.09733938042624272, 0.08791658920415796, 0.08791658920415796, 0.07448338798267237, 0.07448338798267237, 0.0767613147118111, 0.0767613147118111, 0.07152613949714658, 0.07152613949714658, 0.07218002245346791, 0.07218002245346791, 0.07218002245346791, 0.0727822103056859, 0.0727822103056859, 0.06409070043067071, 0.06409070043067071, 0.05969236075238449, 0.05969236075238449, 0.06923746499520125, 0.06923746499520125, 0.06907980694980323, 0.06907980694980323, 0.05680241207913092, 0.05680241207913092, 0.05680241207913092, 0.062098082587713606, 0.062098082587713606, 0.08217037783013864, 0.08217037783013864, 0.07396535495108778, 0.07396535495108778, 0.07110803430400549, 0.07110803430400549, 0.07003520460633673, 0.07003520460633673, 0.07003520460633673, 0.05085658547056923, 0.05085658547056923, 0.03540607297732927, 0.03540607297732927, 0.027072726018929738, 0.027072726018929738, 0.023274049548242495, 0.023274049548242495, 0.02204286452237619, 0.02204286452237619, 0.018730699091887733, 0.018730699091887733], 0] Starting Training! -Resuming from checkpoint: outputs/aha/checkpoint-975 +Resuming from checkpoint: outputs/aha/checkpoint-2075 Starting Training! -Resuming from checkpoint: outputs/aha/checkpoint-975 -ninja: no work to do. -Time to load cpu_adam op: 2.2936739921569824 seconds -ninja: no work to do. -Time to load cpu_adam op: 2.2399489879608154 seconds +Resuming from checkpoint: outputs/aha/checkpoint-2075 +Starting Training! +Resuming from checkpoint: outputs/aha/checkpoint-2075 +Starting Training! +Resuming from checkpoint: outputs/aha/checkpoint-2075 ninja: no work to do. -Time to load cpu_adam op: 2.246027708053589 seconds -Time to load cpu_adam op: 2.2889904975891113 seconds -tensor(0.2204, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008057240396738053, 'train/lm_loss': 0.00011619529686868191, 'train/info_loss': 4.273470403859392e-05, 'train/ref_loss': 0.3618800640106201, 'train/uncertainty_loss': 0.02204093784093857, 'train/video_loss': 0.39040952920913696, 'train/total_loss': 0.3905257284641266} -tensor(0.0615, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0502, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2355, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009562340565025806, 'train/lm_loss': 0.0001391165307722986, 'train/info_loss': 4.2436706280568615e-05, 'train/ref_loss': 0.15135735273361206, 'train/uncertainty_loss': -7.079218048602343e-05, 'train/video_loss': 0.15897886455059052, 'train/total_loss': 0.15911798179149628} -[Rank 2] Trainer log: {'loss': 0.3737, 'grad_norm': 1.138757586479187, 'learning_rate': 1.640378297207456e-05}[Rank 0] Trainer log: {'loss': 0.3737, 'grad_norm': 1.138757586479187, 'learning_rate': 1.640378297207456e-05} -[Rank 3] Trainer log: {'loss': 0.3737, 'grad_norm': 1.138757586479187, 'learning_rate': 1.640378297207456e-05} -[Rank 1] Trainer log: {'loss': 0.3737, 'grad_norm': 1.138757586479187, 'learning_rate': 1.640378297207456e-05} +Time to load cpu_adam op: 2.287750244140625 seconds +Time to load cpu_adam op: 2.293984889984131 seconds +Time to load cpu_adam op: 2.2945845127105713 seconds +Time to load cpu_adam op: 2.2895333766937256 seconds +tensor(0.1152, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.000334132113493979, 'train/lm_loss': 3.177867038175464e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2817174196243286, 'train/uncertainty_loss': 0.011518295109272004, 'train/video_loss': 0.2959294617176056, 'train/total_loss': 0.2959612309932709} +tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) +tensor(0.0608, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.0280, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(0.3057, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.0002599612809717655, 'train/lm_loss': 4.741583543363959e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.42135030031204224, 'train/uncertainty_loss': 0.03057071566581726, 'train/video_loss': 0.4540237784385681, 'train/total_loss': 0.45407119393348694} +[Rank 1] Trainer log: {'loss': 0.2789, 'grad_norm': 6.179122447967529, 'learning_rate': 5.396055484682719e-06}[Rank 0] Trainer log: {'loss': 0.2789, 'grad_norm': 6.179122447967529, 'learning_rate': 5.396055484682719e-06}[Rank 2] Trainer log: {'loss': 0.2789, 'grad_norm': 6.179122447967529, 'learning_rate': 5.396055484682719e-06} -{'loss': 0.3737, 'grad_norm': 1.138757586479187, 'learning_rate': 1.640378297207456e-05, 'epoch': 0.31} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1752513885498047, 'train/info_loss': 0.26765304803848267, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001318566151894629, 'train/video_loss': 0.2675212025642395, 'train/total_loss': 0.44277259707450867} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2694, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1669, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008481717668473721, 'train/lm_loss': 6.839059642516077e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.3225399851799011, 'train/uncertainty_loss': 0.016690045595169067, 'train/video_loss': 0.3460500240325928, 'train/total_loss': 0.34611842036247253} -[Rank 1] Trainer log: {'loss': 0.4564, 'grad_norm': 15.013384819030762, 'learning_rate': 1.63955860173363e-05}[Rank 3] Trainer log: {'loss': 0.4564, 'grad_norm': 15.013384819030762, 'learning_rate': 1.63955860173363e-05}[Rank 2] Trainer log: {'loss': 0.4564, 'grad_norm': 15.013384819030762, 'learning_rate': 1.63955860173363e-05} +[Rank 3] Trainer log: {'loss': 0.2789, 'grad_norm': 6.179122447967529, 'learning_rate': 5.396055484682719e-06} +{'loss': 0.2789, 'grad_norm': 6.179122447967529, 'learning_rate': 5.396055484682719e-06, 'epoch': 0.67} +tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) +tensor(0.0679, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.0003141710069030524, 'train/lm_loss': 5.3613149793818594e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.2282991260290146, 'train/uncertainty_loss': 0.006792537868022919, 'train/video_loss': 0.23762917518615723, 'train/total_loss': 0.2376827895641327} +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.2199, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) +tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) +tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.06702634692192078, 'train/info_loss': 0.11625701934099197, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012080817250534893, 'train/video_loss': 0.11613620817661285, 'train/total_loss': 0.18316255509853363} +tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.3175, 'grad_norm': 2.9217422008514404, 'learning_rate': 5.386588370213124e-06} +[Rank 0] Trainer log: {'loss': 0.3175, 'grad_norm': 2.9217422008514404, 'learning_rate': 5.386588370213124e-06}[Rank 3] Trainer log: {'loss': 0.3175, 'grad_norm': 2.9217422008514404, 'learning_rate': 5.386588370213124e-06} +[Rank 2] Trainer log: {'loss': 0.3175, 'grad_norm': 2.9217422008514404, 'learning_rate': 5.386588370213124e-06} -[Rank 0] Trainer log: {'loss': 0.4564, 'grad_norm': 15.013384819030762, 'learning_rate': 1.63955860173363e-05} -{'loss': 0.4564, 'grad_norm': 15.013384819030762, 'learning_rate': 1.63955860173363e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0792, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1435, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0015209039673209192, 'train/lm_loss': 0.00011719607282429935, 'train/info_loss': 4.4820684706792235e-05, 'train/ref_loss': 0.2937156558036804, 'train/uncertainty_loss': 0.014346362650394441, 'train/video_loss': 0.32027408480644226, 'train/total_loss': 0.3203912675380707} +{'loss': 0.3175, 'grad_norm': 2.9217422008514404, 'learning_rate': 5.386588370213124e-06, 'epoch': 0.67} +tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.21666500568389893, 'train/info_loss': 0.20285502076148987, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000115617283154279, 'train/video_loss': 0.2027394026517868, 'train/total_loss': 0.41940441727638245} +tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) +tensor(0.0125, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.0021, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.33185107707977296, 'train/info_loss': 0.13354338705539703, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010592369362711908, 'train/video_loss': 0.13343746960163116, 'train/total_loss': 0.46528857946395874} tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1003, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0013115388341248036, 'train/lm_loss': 0.00010158818913623691, 'train/info_loss': 3.9814316551201046e-05, 'train/ref_loss': 0.2822808623313904, 'train/uncertainty_loss': 0.010030072182416916, 'train/video_loss': 0.3028430640697479, 'train/total_loss': 0.3029446601867676} -[Rank 3] Trainer log: {'loss': 0.3596, 'grad_norm': 2.761920690536499, 'learning_rate': 1.6387381784641628e-05}[Rank 2] Trainer log: {'loss': 0.3596, 'grad_norm': 2.761920690536499, 'learning_rate': 1.6387381784641628e-05} - -[Rank 0] Trainer log: {'loss': 0.3596, 'grad_norm': 2.761920690536499, 'learning_rate': 1.6387381784641628e-05}[Rank 1] Trainer log: {'loss': 0.3596, 'grad_norm': 2.761920690536499, 'learning_rate': 1.6387381784641628e-05} +tensor(0.0911, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.2186, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.3286, 'grad_norm': 2.4960975646972656, 'learning_rate': 5.377126505646924e-06}[Rank 3] Trainer log: {'loss': 0.3286, 'grad_norm': 2.4960975646972656, 'learning_rate': 5.377126505646924e-06} +[Rank 0] Trainer log: {'loss': 0.3286, 'grad_norm': 2.4960975646972656, 'learning_rate': 5.377126505646924e-06} +[Rank 2] Trainer log: {'loss': 0.3286, 'grad_norm': 2.4960975646972656, 'learning_rate': 5.377126505646924e-06} -{'loss': 0.3596, 'grad_norm': 2.761920690536499, 'learning_rate': 1.6387381784641628e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0150, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1721, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009329324588179589, 'train/lm_loss': 8.90771858394146e-05, 'train/info_loss': 3.743031629710458e-05, 'train/ref_loss': 0.3270999491214752, 'train/uncertainty_loss': 0.01721448004245758, 'train/video_loss': 0.3518153131008148, 'train/total_loss': 0.35190439224243164} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22260327339172364, 'train/info_loss': 0.13384655117988586, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010243868455290795, 'train/video_loss': 0.13374410569667816, 'train/total_loss': 0.35634738206863403} +{'loss': 0.3286, 'grad_norm': 2.4960975646972656, 'learning_rate': 5.377126505646924e-06, 'epoch': 0.67} +tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) +tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3651, 'grad_norm': 3.4514734745025635, 'learning_rate': 1.6379170283326675e-05}[Rank 2] Trainer log: {'loss': 0.3651, 'grad_norm': 3.4514734745025635, 'learning_rate': 1.6379170283326675e-05}[Rank 3] Trainer log: {'loss': 0.3651, 'grad_norm': 3.4514734745025635, 'learning_rate': 1.6379170283326675e-05} +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.3841631889343262, 'train/info_loss': 0.23512820899486542, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001275499234907329, 'train/video_loss': 0.23500065505504608, 'train/total_loss': 0.6191638708114624} +tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) +tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) +tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) +tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.10131968259811402, 'train/info_loss': 0.18927474319934845, 'train/ref_loss': None, 'train/uncertainty_loss': -9.222659864462913e-05, 'train/video_loss': 0.18918251991271973, 'train/total_loss': 0.2905021905899048} +[Rank 1] Trainer log: {'loss': 0.3943, 'grad_norm': 1.9749025106430054, 'learning_rate': 5.3676699017513956e-06}[Rank 3] Trainer log: {'loss': 0.3943, 'grad_norm': 1.9749025106430054, 'learning_rate': 5.3676699017513956e-06} +[Rank 0] Trainer log: {'loss': 0.3943, 'grad_norm': 1.9749025106430054, 'learning_rate': 5.3676699017513956e-06}[Rank 2] Trainer log: {'loss': 0.3943, 'grad_norm': 1.9749025106430054, 'learning_rate': 5.3676699017513956e-06} -[Rank 0] Trainer log: {'loss': 0.3651, 'grad_norm': 3.4514734745025635, 'learning_rate': 1.6379170283326675e-05} -{'loss': 0.3651, 'grad_norm': 3.4514734745025635, 'learning_rate': 1.6379170283326675e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15100182294845582, 'train/info_loss': 0.1741984486579895, 'train/ref_loss': None, 'train/uncertainty_loss': -8.991920622065664e-05, 'train/video_loss': 0.17410853505134583, 'train/total_loss': 0.32511037588119507} -tensor(0.1966, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1714, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) +{'loss': 0.3943, 'grad_norm': 1.9749025106430054, 'learning_rate': 5.3676699017513956e-06, 'epoch': 0.67} tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.41518659591674806, 'train/info_loss': 0.16691680252552032, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001260237768292427, 'train/video_loss': 0.16679078340530396, 'train/total_loss': 0.581977367401123} +{'train/tv_loss': None, 'train/lm_loss': 1.9263292779214682e-05, 'train/info_loss': 8.935135701904073e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012658059131354094, 'train/video_loss': -3.72292342945002e-05, 'train/total_loss': -1.7965941879083402e-05} +tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) +tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.06375178694725037, 'train/info_loss': 0.1386939287185669, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010659184772521257, 'train/video_loss': 0.1385873407125473, 'train/total_loss': 0.20233912765979767} tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1103, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3753, 'grad_norm': 12.842230796813965, 'learning_rate': 1.637095152273585e-05}[Rank 3] Trainer log: {'loss': 0.3753, 'grad_norm': 12.842230796813965, 'learning_rate': 1.637095152273585e-05}[Rank 2] Trainer log: {'loss': 0.3753, 'grad_norm': 12.842230796813965, 'learning_rate': 1.637095152273585e-05} - +tensor(0.3624, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 0] Trainer log: {'loss': 0.2786, 'grad_norm': 3.6809277534484863, 'learning_rate': 5.358218569287834e-06}[Rank 1] Trainer log: {'loss': 0.2786, 'grad_norm': 3.6809277534484863, 'learning_rate': 5.358218569287834e-06} +[Rank 2] Trainer log: {'loss': 0.2786, 'grad_norm': 3.6809277534484863, 'learning_rate': 5.358218569287834e-06} +[Rank 3] Trainer log: {'loss': 0.2786, 'grad_norm': 3.6809277534484863, 'learning_rate': 5.358218569287834e-06} -[Rank 0] Trainer log: {'loss': 0.3753, 'grad_norm': 12.842230796813965, 'learning_rate': 1.637095152273585e-05} -{'loss': 0.3753, 'grad_norm': 12.842230796813965, 'learning_rate': 1.637095152273585e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) +{'loss': 0.2786, 'grad_norm': 3.6809277534484863, 'learning_rate': 5.358218569287834e-06, 'epoch': 0.67} tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(0.0772, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001044172327965498, 'train/lm_loss': 0.00010992842726409435, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.2613227367401123, 'train/uncertainty_loss': 0.007717213034629822, 'train/video_loss': 0.27743446826934814, 'train/total_loss': 0.27754440903663635} +tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.2943330526351929, 'train/info_loss': 0.14906230568885803, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011231083190068603, 'train/video_loss': 0.14894999563694, 'train/total_loss': 0.4432830810546875} +tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) +tensor(0.0972, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00019463470671325924, 'train/lm_loss': 2.817909116856754e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.17679685354232788, 'train/uncertainty_loss': -7.071272702887655e-05, 'train/video_loss': 0.17830266058444977, 'train/total_loss': 0.17833083868026733} +tensor(0.5060, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.3584, 'grad_norm': 8.921645164489746, 'learning_rate': 5.348772519011522e-06}[Rank 3] Trainer log: {'loss': 0.3584, 'grad_norm': 8.921645164489746, 'learning_rate': 5.348772519011522e-06} + +[Rank 0] Trainer log: {'loss': 0.3584, 'grad_norm': 8.921645164489746, 'learning_rate': 5.348772519011522e-06}[Rank 2] Trainer log: {'loss': 0.3584, 'grad_norm': 8.921645164489746, 'learning_rate': 5.348772519011522e-06} + +{'loss': 0.3584, 'grad_norm': 8.921645164489746, 'learning_rate': 5.348772519011522e-06, 'epoch': 0.67} +tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) +tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38717830181121826, 'train/info_loss': 0.2320910543203354, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012302817776799202, 'train/video_loss': 0.23196803033351898, 'train/total_loss': 0.6191463470458984} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.18815828561782838, 'train/info_loss': 0.2182503193616867, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001221169950440526, 'train/video_loss': 0.21812820434570312, 'train/total_loss': 0.40628647804260254} tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.47, 'grad_norm': 6.236999034881592, 'learning_rate': 1.6362725512221816e-05}[Rank 1] Trainer log: {'loss': 0.47, 'grad_norm': 6.236999034881592, 'learning_rate': 1.6362725512221816e-05}[Rank 2] Trainer log: {'loss': 0.47, 'grad_norm': 6.236999034881592, 'learning_rate': 1.6362725512221816e-05} - +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.1818, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.0002212676452472806, 'train/lm_loss': 3.621250216383487e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.3341350257396698, 'train/uncertainty_loss': 0.018175090849399566, 'train/video_loss': 0.35410094261169434, 'train/total_loss': 0.3541371524333954} +tensor(0.0466, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.338, 'grad_norm': 7.290347576141357, 'learning_rate': 5.33933176167174e-06}[Rank 3] Trainer log: {'loss': 0.338, 'grad_norm': 7.290347576141357, 'learning_rate': 5.33933176167174e-06}[Rank 0] Trainer log: {'loss': 0.338, 'grad_norm': 7.290347576141357, 'learning_rate': 5.33933176167174e-06} -[Rank 0] Trainer log: {'loss': 0.47, 'grad_norm': 6.236999034881592, 'learning_rate': 1.6362725512221816e-05} -{'loss': 0.47, 'grad_norm': 6.236999034881592, 'learning_rate': 1.6362725512221816e-05, 'epoch': 0.32} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18853067159652712, 'train/info_loss': 0.17951151728630066, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011444524861872197, 'train/video_loss': 0.1793970763683319, 'train/total_loss': 0.3679277300834656} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18521385192871095, 'train/info_loss': 0.23002685606479645, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001066391938365996, 'train/video_loss': 0.22992022335529327, 'train/total_loss': 0.415134072303772} -tensor(0.1219, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3723, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.5072, 'grad_norm': 10.11407470703125, 'learning_rate': 1.635449226114549e-05}[Rank 2] Trainer log: {'loss': 0.5072, 'grad_norm': 10.11407470703125, 'learning_rate': 1.635449226114549e-05} +[Rank 2] Trainer log: {'loss': 0.338, 'grad_norm': 7.290347576141357, 'learning_rate': 5.33933176167174e-06} -[Rank 1] Trainer log: {'loss': 0.5072, 'grad_norm': 10.11407470703125, 'learning_rate': 1.635449226114549e-05} -[Rank 0] Trainer log: {'loss': 0.5072, 'grad_norm': 10.11407470703125, 'learning_rate': 1.635449226114549e-05} -{'loss': 0.5072, 'grad_norm': 10.11407470703125, 'learning_rate': 1.635449226114549e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) +{'loss': 0.338, 'grad_norm': 7.290347576141357, 'learning_rate': 5.33933176167174e-06, 'epoch': 0.67} +tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16356310844421387, 'train/info_loss': 0.14643177390098572, 'train/ref_loss': None, 'train/uncertainty_loss': -9.01074439752847e-05, 'train/video_loss': 0.14634166657924652, 'train/total_loss': 0.3099047839641571} -tensor(0.1825, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.1676993489265442, 'train/info_loss': 0.16723506152629852, 'train/ref_loss': None, 'train/uncertainty_loss': -9.374739020131529e-05, 'train/video_loss': 0.16714131832122803, 'train/total_loss': 0.33484065532684326} +tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2248, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009957646019756795, 'train/lm_loss': 0.00010833189589902759, 'train/info_loss': 3.951631879317574e-05, 'train/ref_loss': 0.35675305128097534, 'train/uncertainty_loss': 0.022476166486740112, 'train/video_loss': 0.3872348666191101, 'train/total_loss': 0.3873431980609894} -[Rank 3] Trainer log: {'loss': 0.3956, 'grad_norm': 8.027265548706055, 'learning_rate': 1.6346251778876034e-05}[Rank 0] Trainer log: {'loss': 0.3956, 'grad_norm': 8.027265548706055, 'learning_rate': 1.6346251778876034e-05}[Rank 2] Trainer log: {'loss': 0.3956, 'grad_norm': 8.027265548706055, 'learning_rate': 1.6346251778876034e-05} - -[Rank 1] Trainer log: {'loss': 0.3956, 'grad_norm': 8.027265548706055, 'learning_rate': 1.6346251778876034e-05} - -{'loss': 0.3956, 'grad_norm': 8.027265548706055, 'learning_rate': 1.6346251778876034e-05, 'epoch': 0.32} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) +tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) +tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) +tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.07586914896965027, 'train/info_loss': 0.19399400055408478, 'train/ref_loss': None, 'train/uncertainty_loss': -8.165427134372295e-05, 'train/video_loss': 0.1939123421907425, 'train/total_loss': 0.2697815001010895} +[Rank 1] Trainer log: {'loss': 0.3344, 'grad_norm': 2.729362726211548, 'learning_rate': 5.329896308011746e-06} +[Rank 3] Trainer log: {'loss': 0.3344, 'grad_norm': 2.729362726211548, 'learning_rate': 5.329896308011746e-06} +[Rank 2] Trainer log: {'loss': 0.3344, 'grad_norm': 2.729362726211548, 'learning_rate': 5.329896308011746e-06} +[Rank 0] Trainer log: {'loss': 0.3344, 'grad_norm': 2.729362726211548, 'learning_rate': 5.329896308011746e-06} +{'loss': 0.3344, 'grad_norm': 2.729362726211548, 'learning_rate': 5.329896308011746e-06, 'epoch': 0.67} +tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) +tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009665003977715969, 'train/lm_loss': 0.00011822066735476256, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.05018105357885361, 'train/uncertainty_loss': -7.398106972686947e-05, 'train/video_loss': 0.057879846543073654, 'train/total_loss': 0.05799806863069534} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24710693359375002, 'train/info_loss': 0.10016009956598282, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013044244842603804, 'train/video_loss': 0.10002965480089188, 'train/total_loss': 0.34713658690452576} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.7753, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.459, 'grad_norm': 6.656757354736328, 'learning_rate': 1.6338004074790827e-05}[Rank 1] Trainer log: {'loss': 0.459, 'grad_norm': 6.656757354736328, 'learning_rate': 1.6338004074790827e-05} -[Rank 3] Trainer log: {'loss': 0.459, 'grad_norm': 6.656757354736328, 'learning_rate': 1.6338004074790827e-05}[Rank 2] Trainer log: {'loss': 0.459, 'grad_norm': 6.656757354736328, 'learning_rate': 1.6338004074790827e-05} - - -{'loss': 0.459, 'grad_norm': 6.656757354736328, 'learning_rate': 1.6338004074790827e-05, 'epoch': 0.32} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22772152423858644, 'train/info_loss': 0.23055961728096008, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012859554262831808, 'train/video_loss': 0.23043102025985718, 'train/total_loss': 0.45815253257751465} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1510, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1594, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012007216922938825, 'train/lm_loss': 0.000149885262362659, 'train/info_loss': 4.803903721040115e-05, 'train/ref_loss': 0.3195462226867676, 'train/uncertainty_loss': 0.015938019752502443, 'train/video_loss': 0.3451380431652069, 'train/total_loss': 0.34528791904449463} -[Rank 3] Trainer log: {'loss': 0.4061, 'grad_norm': 6.767094135284424, 'learning_rate': 1.6329749158275466e-05}[Rank 2] Trainer log: {'loss': 0.4061, 'grad_norm': 6.767094135284424, 'learning_rate': 1.6329749158275466e-05}[Rank 0] Trainer log: {'loss': 0.4061, 'grad_norm': 6.767094135284424, 'learning_rate': 1.6329749158275466e-05} +{'train/tv_loss': 0.00028699506074190143, 'train/lm_loss': 2.4722478701733053e-05, 'train/info_loss': 1.5854584489716217e-05, 'train/ref_loss': 0.16213561594486237, 'train/uncertainty_loss': -6.997622549533845e-05, 'train/video_loss': 0.16437745094299316, 'train/total_loss': 0.16440217196941376} +tensor(0.1039, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(0.2505, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.3928, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.1910, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00017832512967288496, 'train/lm_loss': 3.2017051125876606e-05, 'train/info_loss': 1.8536700736149214e-05, 'train/ref_loss': 0.34068092703819275, 'train/uncertainty_loss': 0.019102130830287934, 'train/video_loss': 0.36122819781303406, 'train/total_loss': 0.36126020550727844} +tensor(0.1056, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 2] Trainer log: {'loss': 0.3922, 'grad_norm': 25.53676414489746, 'learning_rate': 5.320466168768763e-06}[Rank 1] Trainer log: {'loss': 0.3922, 'grad_norm': 25.53676414489746, 'learning_rate': 5.320466168768763e-06}[Rank 3] Trainer log: {'loss': 0.3922, 'grad_norm': 25.53676414489746, 'learning_rate': 5.320466168768763e-06} -[Rank 1] Trainer log: {'loss': 0.4061, 'grad_norm': 6.767094135284424, 'learning_rate': 1.6329749158275466e-05} -{'loss': 0.4061, 'grad_norm': 6.767094135284424, 'learning_rate': 1.6329749158275466e-05, 'epoch': 0.32} +[Rank 0] Trainer log: {'loss': 0.3922, 'grad_norm': 25.53676414489746, 'learning_rate': 5.320466168768763e-06} +{'loss': 0.3922, 'grad_norm': 25.53676414489746, 'learning_rate': 5.320466168768763e-06, 'epoch': 0.67} +tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.44742703437805176, 'train/info_loss': 0.14066272974014282, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010276878019794823, 'train/video_loss': 0.14055995643138885, 'train/total_loss': 0.5879870057106018} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) +tensor(0.0523, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00034126632381230597, 'train/lm_loss': 4.705829196609557e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.14596444368362427, 'train/uncertainty_loss': -6.959487800486387e-05, 'train/video_loss': 0.14864769577980042, 'train/total_loss': 0.14869475364685059} +tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13306982517242433, 'train/info_loss': 0.1701342612504959, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011413327883929014, 'train/video_loss': 0.17002013325691223, 'train/total_loss': 0.30308997631073} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4491, 'grad_norm': 3.4552502632141113, 'learning_rate': 1.6321487038723775e-05}[Rank 1] Trainer log: {'loss': 0.4491, 'grad_norm': 3.4552502632141113, 'learning_rate': 1.6321487038723775e-05} +tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) +tensor(0.2626, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00020637928973883392, 'train/lm_loss': 4.114681505598128e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.3821646273136139, 'train/uncertainty_loss': 0.026258140802383423, 'train/video_loss': 0.41009384393692017, 'train/total_loss': 0.4101350009441376} +[Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 3.6489875316619873, 'learning_rate': 5.311041354673965e-06}[Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 3.6489875316619873, 'learning_rate': 5.311041354673965e-06}[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 3.6489875316619873, 'learning_rate': 5.311041354673965e-06} -[Rank 3] Trainer log: {'loss': 0.4491, 'grad_norm': 3.4552502632141113, 'learning_rate': 1.6321487038723775e-05}[Rank 0] Trainer log: {'loss': 0.4491, 'grad_norm': 3.4552502632141113, 'learning_rate': 1.6321487038723775e-05} -{'loss': 0.4491, 'grad_norm': 3.4552502632141113, 'learning_rate': 1.6321487038723775e-05, 'epoch': 0.32} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26768338680267334, 'train/info_loss': 0.19073674082756042, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012272892054170372, 'train/video_loss': 0.1906140148639679, 'train/total_loss': 0.45829740166664124} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14820789098739626, 'train/info_loss': 0.20938153564929962, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001053486135788262, 'train/video_loss': 0.20927618443965912, 'train/total_loss': 0.3574840724468231} +[Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 3.6489875316619873, 'learning_rate': 5.311041354673965e-06} +{'loss': 0.3904, 'grad_norm': 3.6489875316619873, 'learning_rate': 5.311041354673965e-06, 'epoch': 0.67} +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.3425630569458008, 'train/info_loss': 0.09640836715698242, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013262249995023013, 'train/video_loss': 0.09627574682235718, 'train/total_loss': 0.43883880972862244} +tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4136, 'grad_norm': 2.8739962577819824, 'learning_rate': 1.631321772553775e-05}[Rank 1] Trainer log: {'loss': 0.4136, 'grad_norm': 2.8739962577819824, 'learning_rate': 1.631321772553775e-05} -[Rank 2] Trainer log: {'loss': 0.4136, 'grad_norm': 2.8739962577819824, 'learning_rate': 1.631321772553775e-05} - -[Rank 3] Trainer log: {'loss': 0.4136, 'grad_norm': 2.8739962577819824, 'learning_rate': 1.631321772553775e-05} -{'loss': 0.4136, 'grad_norm': 2.8739962577819824, 'learning_rate': 1.631321772553775e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37124569416046144, 'train/info_loss': 0.23484741151332855, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010931423166766764, 'train/video_loss': 0.23473809659481049, 'train/total_loss': 0.6059837937355042} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.21872355937957766, 'train/info_loss': 0.1936417818069458, 'train/ref_loss': None, 'train/uncertainty_loss': -9.261313243769109e-05, 'train/video_loss': 0.19354917109012604, 'train/total_loss': 0.41227275133132935} tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.4042, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1339, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0013623428530991077, 'train/lm_loss': 0.00016139192739501597, 'train/info_loss': 4.6966255467850715e-05, 'train/ref_loss': 0.304008424282074, 'train/uncertainty_loss': 0.013390748202800751, 'train/video_loss': 0.3283448815345764, 'train/total_loss': 0.3285062611103058} -[Rank 1] Trainer log: {'loss': 0.4774, 'grad_norm': 7.135345458984375, 'learning_rate': 1.630494122812759e-05} -[Rank 3] Trainer log: {'loss': 0.4774, 'grad_norm': 7.135345458984375, 'learning_rate': 1.630494122812759e-05} -[Rank 0] Trainer log: {'loss': 0.4774, 'grad_norm': 7.135345458984375, 'learning_rate': 1.630494122812759e-05}[Rank 2] Trainer log: {'loss': 0.4774, 'grad_norm': 7.135345458984375, 'learning_rate': 1.630494122812759e-05} +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.0175, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 3] Trainer log: {'loss': 0.2958, 'grad_norm': 2.4354987144470215, 'learning_rate': 5.3016218764524566e-06}[Rank 2] Trainer log: {'loss': 0.2958, 'grad_norm': 2.4354987144470215, 'learning_rate': 5.3016218764524566e-06}[Rank 1] Trainer log: {'loss': 0.2958, 'grad_norm': 2.4354987144470215, 'learning_rate': 5.3016218764524566e-06} -{'loss': 0.4774, 'grad_norm': 7.135345458984375, 'learning_rate': 1.630494122812759e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) + +[Rank 0] Trainer log: {'loss': 0.2958, 'grad_norm': 2.4354987144470215, 'learning_rate': 5.3016218764524566e-06} +{'loss': 0.2958, 'grad_norm': 2.4354987144470215, 'learning_rate': 5.3016218764524566e-06, 'epoch': 0.67} +tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) +tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14400602579116822, 'train/info_loss': 0.152194544672966, 'train/ref_loss': None, 'train/uncertainty_loss': -8.764031808823348e-05, 'train/video_loss': 0.15210691094398499, 'train/total_loss': 0.29611295461654663} -tensor(0.4345, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.04078601598739624, 'train/info_loss': 0.1744564175605774, 'train/ref_loss': None, 'train/uncertainty_loss': -9.315100614912808e-05, 'train/video_loss': 0.17436327040195465, 'train/total_loss': 0.21514928340911865} +tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.4514183521270752, 'train/info_loss': 0.1671510636806488, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012873088708147408, 'train/video_loss': 0.16702233254909515, 'train/total_loss': 0.6184406876564026} tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3880802154541016, 'train/info_loss': 0.19103002548217773, 'train/ref_loss': None, 'train/uncertainty_loss': -9.857392869889737e-05, 'train/video_loss': 0.19093145430088043, 'train/total_loss': 0.5790116786956787} -[Rank 3] Trainer log: {'loss': 0.5179, 'grad_norm': 9.568516731262207, 'learning_rate': 1.6296657555911662e-05}[Rank 1] Trainer log: {'loss': 0.5179, 'grad_norm': 9.568516731262207, 'learning_rate': 1.6296657555911662e-05} +tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) +tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) +[Rank 3] Trainer log: {'loss': 0.4384, 'grad_norm': 3.6673552989959717, 'learning_rate': 5.292207744823292e-06}[Rank 1] Trainer log: {'loss': 0.4384, 'grad_norm': 3.6673552989959717, 'learning_rate': 5.292207744823292e-06}[Rank 0] Trainer log: {'loss': 0.4384, 'grad_norm': 3.6673552989959717, 'learning_rate': 5.292207744823292e-06} -[Rank 2] Trainer log: {'loss': 0.5179, 'grad_norm': 9.568516731262207, 'learning_rate': 1.6296657555911662e-05}[Rank 0] Trainer log: {'loss': 0.5179, 'grad_norm': 9.568516731262207, 'learning_rate': 1.6296657555911662e-05} +[Rank 2] Trainer log: {'loss': 0.4384, 'grad_norm': 3.6673552989959717, 'learning_rate': 5.292207744823292e-06} -{'loss': 0.5179, 'grad_norm': 9.568516731262207, 'learning_rate': 1.6296657555911662e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26672067642211916, 'train/info_loss': 0.13223646581172943, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001173800090327859, 'train/video_loss': 0.1321190893650055, 'train/total_loss': 0.3988397717475891} +{'loss': 0.4384, 'grad_norm': 3.6673552989959717, 'learning_rate': 5.292207744823292e-06, 'epoch': 0.67} +tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) +tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.2933893442153931, 'train/info_loss': 0.2160397469997406, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012039007851853967, 'train/video_loss': 0.2159193605184555, 'train/total_loss': 0.5093086957931519} +tensor(0.5757, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2856492280960083, 'train/info_loss': 0.19189755618572235, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001266586594283581, 'train/video_loss': 0.19177089631557465, 'train/total_loss': 0.4774201512336731} -tensor(0.2212, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.5225, 'grad_norm': 6.002381324768066, 'learning_rate': 1.6288366718316504e-05}[Rank 1] Trainer log: {'loss': 0.5225, 'grad_norm': 6.002381324768066, 'learning_rate': 1.6288366718316504e-05}[Rank 0] Trainer log: {'loss': 0.5225, 'grad_norm': 6.002381324768066, 'learning_rate': 1.6288366718316504e-05} +tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.3226417779922486, 'train/info_loss': 0.2257913053035736, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012224605306982995, 'train/video_loss': 0.22566905617713928, 'train/total_loss': 0.5483108758926392} +tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) +tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.4335, 'grad_norm': 7.770205020904541, 'learning_rate': 5.282798970499419e-06}[Rank 3] Trainer log: {'loss': 0.4335, 'grad_norm': 7.770205020904541, 'learning_rate': 5.282798970499419e-06}[Rank 0] Trainer log: {'loss': 0.4335, 'grad_norm': 7.770205020904541, 'learning_rate': 5.282798970499419e-06} -[Rank 2] Trainer log: {'loss': 0.5225, 'grad_norm': 6.002381324768066, 'learning_rate': 1.6288366718316504e-05} -{'loss': 0.5225, 'grad_norm': 6.002381324768066, 'learning_rate': 1.6288366718316504e-05, 'epoch': 0.32} +[Rank 2] Trainer log: {'loss': 0.4335, 'grad_norm': 7.770205020904541, 'learning_rate': 5.282798970499419e-06} +{'loss': 0.4335, 'grad_norm': 7.770205020904541, 'learning_rate': 5.282798970499419e-06, 'epoch': 0.67} tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) +tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) +tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.05969409346580506, 'train/info_loss': 0.12639681994915009, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010193279013037682, 'train/video_loss': 0.12629488110542297, 'train/total_loss': 0.18598897755146027} +tensor(0.4648, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010931103490293026, 'train/lm_loss': 8.986361208371819e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.096525639295578, 'train/uncertainty_loss': -6.960913306102157e-05, 'train/video_loss': 0.10523774474859238, 'train/total_loss': 0.10532760620117188} +{'train/tv_loss': 0.0001848034793511033, 'train/lm_loss': 4.784488410223276e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.09314209222793579, 'train/uncertainty_loss': -6.842478760518134e-05, 'train/video_loss': 0.09457587450742722, 'train/total_loss': 0.09462372213602066} +tensor(0.0209, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.3372, 'grad_norm': 3.80949330329895, 'learning_rate': 5.2733955641877014e-06} +[Rank 0] Trainer log: {'loss': 0.3372, 'grad_norm': 3.80949330329895, 'learning_rate': 5.2733955641877014e-06}[Rank 3] Trainer log: {'loss': 0.3372, 'grad_norm': 3.80949330329895, 'learning_rate': 5.2733955641877014e-06} +[Rank 2] Trainer log: {'loss': 0.3372, 'grad_norm': 3.80949330329895, 'learning_rate': 5.2733955641877014e-06} + +{'loss': 0.3372, 'grad_norm': 3.80949330329895, 'learning_rate': 5.2733955641877014e-06, 'epoch': 0.67} +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.3318225860595703, 'train/info_loss': 0.15727035701274872, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012564819771796465, 'train/video_loss': 0.1571447104215622, 'train/total_loss': 0.48896729946136475} +tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) +tensor(0.1757, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(0.0752, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) +tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.36557722091674805, 'train/info_loss': 0.21017247438430786, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001263092621229589, 'train/video_loss': 0.2100461721420288, 'train/total_loss': 0.5756233930587769} +tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) +[Rank 2] Trainer log: {'loss': 0.4421, 'grad_norm': 2.6192946434020996, 'learning_rate': 5.2639975365888915e-06}[Rank 1] Trainer log: {'loss': 0.4421, 'grad_norm': 2.6192946434020996, 'learning_rate': 5.2639975365888915e-06}[Rank 0] Trainer log: {'loss': 0.4421, 'grad_norm': 2.6192946434020996, 'learning_rate': 5.2639975365888915e-06} + +[Rank 3] Trainer log: {'loss': 0.4421, 'grad_norm': 2.6192946434020996, 'learning_rate': 5.2639975365888915e-06} + +{'loss': 0.4421, 'grad_norm': 2.6192946434020996, 'learning_rate': 5.2639975365888915e-06, 'epoch': 0.67} tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21335966587066652, 'train/info_loss': 0.20443585515022278, 'train/ref_loss': None, 'train/uncertainty_loss': -9.755018982104957e-05, 'train/video_loss': 0.20433831214904785, 'train/total_loss': 0.4176979660987854} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0412, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.305, 'grad_norm': 2.150538206100464, 'learning_rate': 1.6280068724776795e-05}[Rank 1] Trainer log: {'loss': 0.305, 'grad_norm': 2.150538206100464, 'learning_rate': 1.6280068724776795e-05}[Rank 2] Trainer log: {'loss': 0.305, 'grad_norm': 2.150538206100464, 'learning_rate': 1.6280068724776795e-05} -[Rank 0] Trainer log: {'loss': 0.305, 'grad_norm': 2.150538206100464, 'learning_rate': 1.6280068724776795e-05} +tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) +tensor(0.0558, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.0002826522570103407, 'train/lm_loss': 4.1075304034166043e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.2548973560333252, 'train/uncertainty_loss': 0.005578510090708733, 'train/video_loss': 0.2627580761909485, 'train/total_loss': 0.26279914379119873} +tensor(0.0751, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(0.1031, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.2885, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.3759, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00014740541810169818, 'train/lm_loss': 4.1718900320120156e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.44964680075645447, 'train/uncertainty_loss': 0.03759066164493561, 'train/video_loss': 0.48843905329704285, 'train/total_loss': 0.4884807765483856} +tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 3] Trainer log: {'loss': 0.3811, 'grad_norm': 6.568197250366211, 'learning_rate': 5.254604898397622e-06}[Rank 2] Trainer log: {'loss': 0.3811, 'grad_norm': 6.568197250366211, 'learning_rate': 5.254604898397622e-06} +[Rank 0] Trainer log: {'loss': 0.3811, 'grad_norm': 6.568197250366211, 'learning_rate': 5.254604898397622e-06}[Rank 1] Trainer log: {'loss': 0.3811, 'grad_norm': 6.568197250366211, 'learning_rate': 5.254604898397622e-06} -{'loss': 0.305, 'grad_norm': 2.150538206100464, 'learning_rate': 1.6280068724776795e-05, 'epoch': 0.32} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) +{'loss': 0.3811, 'grad_norm': 6.568197250366211, 'learning_rate': 5.254604898397622e-06, 'epoch': 0.67} tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1144, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012724722735583784, 'train/lm_loss': 0.00014104637084528805, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.29118889570236206, 'train/uncertainty_loss': 0.011438516527414323, 'train/video_loss': 0.31285029649734497, 'train/total_loss': 0.31299135088920593} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) +tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16582590341567993, 'train/info_loss': 0.21356548368930817, 'train/ref_loss': None, 'train/uncertainty_loss': -9.082397446036339e-05, 'train/video_loss': 0.21347466111183167, 'train/total_loss': 0.3793005645275116} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3954, 'grad_norm': 5.590821743011475, 'learning_rate': 1.6271763584735373e-05}[Rank 3] Trainer log: {'loss': 0.3954, 'grad_norm': 5.590821743011475, 'learning_rate': 1.6271763584735373e-05}[Rank 2] Trainer log: {'loss': 0.3954, 'grad_norm': 5.590821743011475, 'learning_rate': 1.6271763584735373e-05} +{'train/tv_loss': None, 'train/lm_loss': 0.20859401226043703, 'train/info_loss': 0.15063157677650452, 'train/ref_loss': None, 'train/uncertainty_loss': -8.650162490084768e-05, 'train/video_loss': 0.15054507553577423, 'train/total_loss': 0.359139084815979} +tensor(0.3707, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) +tensor(0.1651, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(0.0766, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.0790, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00012563209747895598, 'train/lm_loss': 5.358931375667453e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.2713722586631775, 'train/uncertainty_loss': 0.007897476851940156, 'train/video_loss': 0.2802982032299042, 'train/total_loss': 0.28035178780555725} +[Rank 3] Trainer log: {'loss': 0.3563, 'grad_norm': 5.911625385284424, 'learning_rate': 5.245217660302393e-06}[Rank 0] Trainer log: {'loss': 0.3563, 'grad_norm': 5.911625385284424, 'learning_rate': 5.245217660302393e-06}[Rank 1] Trainer log: {'loss': 0.3563, 'grad_norm': 5.911625385284424, 'learning_rate': 5.245217660302393e-06} -[Rank 1] Trainer log: {'loss': 0.3954, 'grad_norm': 5.590821743011475, 'learning_rate': 1.6271763584735373e-05} -{'loss': 0.3954, 'grad_norm': 5.590821743011475, 'learning_rate': 1.6271763584735373e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15804960727691653, 'train/info_loss': 0.19896839559078217, 'train/ref_loss': None, 'train/uncertainty_loss': -9.187590330839157e-05, 'train/video_loss': 0.1988765150308609, 'train/total_loss': 0.3569261431694031} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1132, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0882499098777771, 'train/info_loss': 0.2833555340766907, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001314101740717888, 'train/video_loss': 0.2832241356372833, 'train/total_loss': 0.3714740574359894} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) +[Rank 2] Trainer log: {'loss': 0.3563, 'grad_norm': 5.911625385284424, 'learning_rate': 5.245217660302393e-06} +{'loss': 0.3563, 'grad_norm': 5.911625385284424, 'learning_rate': 5.245217660302393e-06, 'epoch': 0.67} +tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.12044333219528199, 'train/info_loss': 0.24000190198421478, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010617880616337061, 'train/video_loss': 0.23989571630954742, 'train/total_loss': 0.36033904552459717} +tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.3196, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.3214, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.42991042137145996, 'train/info_loss': 0.19825683534145355, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011912664631381632, 'train/video_loss': 0.19813771545886993, 'train/total_loss': 0.6280481219291687} +tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3317, 'grad_norm': 3.976871967315674, 'learning_rate': 1.6263451307643194e-05}[Rank 0] Trainer log: {'loss': 0.3317, 'grad_norm': 3.976871967315674, 'learning_rate': 1.6263451307643194e-05} -[Rank 2] Trainer log: {'loss': 0.3317, 'grad_norm': 3.976871967315674, 'learning_rate': 1.6263451307643194e-05} -[Rank 1] Trainer log: {'loss': 0.3317, 'grad_norm': 3.976871967315674, 'learning_rate': 1.6263451307643194e-05} +tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 2] Trainer log: {'loss': 0.3669, 'grad_norm': 4.621405601501465, 'learning_rate': 5.235835832985552e-06} +[Rank 0] Trainer log: {'loss': 0.3669, 'grad_norm': 4.621405601501465, 'learning_rate': 5.235835832985552e-06}[Rank 3] Trainer log: {'loss': 0.3669, 'grad_norm': 4.621405601501465, 'learning_rate': 5.235835832985552e-06} +[Rank 1] Trainer log: {'loss': 0.3669, 'grad_norm': 4.621405601501465, 'learning_rate': 5.235835832985552e-06} -{'loss': 0.3317, 'grad_norm': 3.976871967315674, 'learning_rate': 1.6263451307643194e-05, 'epoch': 0.32} +{'loss': 0.3669, 'grad_norm': 4.621405601501465, 'learning_rate': 5.235835832985552e-06, 'epoch': 0.68} +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.3681203842163086, 'train/info_loss': 0.21409358084201813, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013099560746923088, 'train/video_loss': 0.213962584733963, 'train/total_loss': 0.582082986831665} tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3866883993148804, 'train/info_loss': 0.22924336791038513, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010740474099293352, 'train/video_loss': 0.22913596034049988, 'train/total_loss': 0.6158243417739868} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0939, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) +tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28033058643341063, 'train/info_loss': 0.18017196655273438, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012003884185105563, 'train/video_loss': 0.18005192279815674, 'train/total_loss': 0.46038252115249634} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4506, 'grad_norm': 2.3910672664642334, 'learning_rate': 1.625513190295935e-05}[Rank 2] Trainer log: {'loss': 0.4506, 'grad_norm': 2.3910672664642334, 'learning_rate': 1.625513190295935e-05}[Rank 3] Trainer log: {'loss': 0.4506, 'grad_norm': 2.3910672664642334, 'learning_rate': 1.625513190295935e-05} - +{'train/tv_loss': None, 'train/lm_loss': 0.3256625413894654, 'train/info_loss': 0.21197225153446198, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012033459497615696, 'train/video_loss': 0.21185190975666046, 'train/total_loss': 0.5375144481658936} +tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) +tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(-0.0002, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +[Rank 0] Trainer log: {'loss': 0.3319, 'grad_norm': 3.0671746730804443, 'learning_rate': 5.2264594271232985e-06}[Rank 2] Trainer log: {'loss': 0.3319, 'grad_norm': 3.0671746730804443, 'learning_rate': 5.2264594271232985e-06} +[Rank 3] Trainer log: {'loss': 0.3319, 'grad_norm': 3.0671746730804443, 'learning_rate': 5.2264594271232985e-06} +[Rank 1] Trainer log: {'loss': 0.3319, 'grad_norm': 3.0671746730804443, 'learning_rate': 5.2264594271232985e-06} -[Rank 0] Trainer log: {'loss': 0.4506, 'grad_norm': 2.3910672664642334, 'learning_rate': 1.625513190295935e-05} -{'loss': 0.4506, 'grad_norm': 2.3910672664642334, 'learning_rate': 1.625513190295935e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) +{'loss': 0.3319, 'grad_norm': 3.0671746730804443, 'learning_rate': 5.2264594271232985e-06, 'epoch': 0.68} tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1925522327423096, 'train/info_loss': 0.19846221804618835, 'train/ref_loss': None, 'train/uncertainty_loss': -9.281996171921492e-05, 'train/video_loss': 0.19836939871311188, 'train/total_loss': 0.3909216523170471} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) +tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.024680739641189577, 'train/info_loss': 0.22439506649971008, 'train/ref_loss': None, 'train/uncertainty_loss': -9.505017078481615e-05, 'train/video_loss': 0.22430001199245453, 'train/total_loss': 0.24898074567317963} +tensor(0.2703, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.2134, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) +tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.2072, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00019791726954281332, 'train/lm_loss': 5.254055140540004e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.3478505611419678, 'train/uncertainty_loss': 0.02072073519229889, 'train/video_loss': 0.37017595767974854, 'train/total_loss': 0.37022849917411804} +[Rank 1] Trainer log: {'loss': 0.3313, 'grad_norm': 10.232361793518066, 'learning_rate': 5.217088453385658e-06} +[Rank 3] Trainer log: {'loss': 0.3313, 'grad_norm': 10.232361793518066, 'learning_rate': 5.217088453385658e-06} +[Rank 2] Trainer log: {'loss': 0.3313, 'grad_norm': 10.232361793518066, 'learning_rate': 5.217088453385658e-06} +[Rank 0] Trainer log: {'loss': 0.3313, 'grad_norm': 10.232361793518066, 'learning_rate': 5.217088453385658e-06} +{'loss': 0.3313, 'grad_norm': 10.232361793518066, 'learning_rate': 5.217088453385658e-06, 'epoch': 0.68} +tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) +tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) +tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.0001836120616644621, 'train/lm_loss': 4.071774892508984e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.1617579460144043, 'train/uncertainty_loss': -6.792510394006968e-05, 'train/video_loss': 0.16317960619926453, 'train/total_loss': 0.1632203310728073} +tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3197020530700684, 'train/info_loss': 0.1790241003036499, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001317097805440426, 'train/video_loss': 0.17889238893985748, 'train/total_loss': 0.4985944628715515} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3954, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.458, 'grad_norm': 5.913549900054932, 'learning_rate': 1.6246805380151028e-05}[Rank 2] Trainer log: {'loss': 0.458, 'grad_norm': 5.913549900054932, 'learning_rate': 1.6246805380151028e-05}[Rank 3] Trainer log: {'loss': 0.458, 'grad_norm': 5.913549900054932, 'learning_rate': 1.6246805380151028e-05} +{'train/tv_loss': None, 'train/lm_loss': 0.22212421894073486, 'train/info_loss': 0.1980726271867752, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013388840015977622, 'train/video_loss': 0.1979387402534485, 'train/total_loss': 0.42006295919418335} +tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) +tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) +tensor(0.3523, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.3441, 'grad_norm': 6.984566688537598, 'learning_rate': 5.207722922436477e-06}[Rank 2] Trainer log: {'loss': 0.3441, 'grad_norm': 6.984566688537598, 'learning_rate': 5.207722922436477e-06}[Rank 0] Trainer log: {'loss': 0.3441, 'grad_norm': 6.984566688537598, 'learning_rate': 5.207722922436477e-06} -[Rank 0] Trainer log: {'loss': 0.458, 'grad_norm': 5.913549900054932, 'learning_rate': 1.6246805380151028e-05} -{'loss': 0.458, 'grad_norm': 5.913549900054932, 'learning_rate': 1.6246805380151028e-05, 'epoch': 0.32} +[Rank 3] Trainer log: {'loss': 0.3441, 'grad_norm': 6.984566688537598, 'learning_rate': 5.207722922436477e-06} +{'loss': 0.3441, 'grad_norm': 6.984566688537598, 'learning_rate': 5.207722922436477e-06, 'epoch': 0.68} tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3718394994735718, 'train/info_loss': 0.1859433501958847, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011998609406873585, 'train/video_loss': 0.18582336604595184, 'train/total_loss': 0.557662844657898} +{'train/tv_loss': None, 'train/lm_loss': 0.27136542797088625, 'train/info_loss': 0.16034714877605438, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012400286504998803, 'train/video_loss': 0.16022314131259918, 'train/total_loss': 0.4315885901451111} tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) +tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) +tensor(0.3354, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1056, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1450533390045166, 'train/info_loss': 0.17725688219070435, 'train/ref_loss': None, 'train/uncertainty_loss': -8.601777371950448e-05, 'train/video_loss': 0.17717085778713226, 'train/total_loss': 0.3222241997718811} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0589, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4106, 'grad_norm': 6.09997034072876, 'learning_rate': 1.6238471748693535e-05}[Rank 0] Trainer log: {'loss': 0.4106, 'grad_norm': 6.09997034072876, 'learning_rate': 1.6238471748693535e-05}[Rank 2] Trainer log: {'loss': 0.4106, 'grad_norm': 6.09997034072876, 'learning_rate': 1.6238471748693535e-05} - -[Rank 3] Trainer log: {'loss': 0.4106, 'grad_norm': 6.09997034072876, 'learning_rate': 1.6238471748693535e-05} - -{'loss': 0.4106, 'grad_norm': 6.09997034072876, 'learning_rate': 1.6238471748693535e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1278, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012008221819996834, 'train/lm_loss': 0.00013130168663337828, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.3012641966342926, 'train/uncertainty_loss': 0.012784802913665773, 'train/video_loss': 0.32369670271873474, 'train/total_loss': 0.3238280117511749} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4400810241699219, 'train/info_loss': 0.1576402485370636, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011949799954891205, 'train/video_loss': 0.15752075612545013, 'train/total_loss': 0.5976017713546753} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1308, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3545, 'grad_norm': 6.040954113006592, 'learning_rate': 1.623013101807025e-05}[Rank 1] Trainer log: {'loss': 0.3545, 'grad_norm': 6.040954113006592, 'learning_rate': 1.623013101807025e-05}[Rank 0] Trainer log: {'loss': 0.3545, 'grad_norm': 6.040954113006592, 'learning_rate': 1.623013101807025e-05} - -[Rank 2] Trainer log: {'loss': 0.3545, 'grad_norm': 6.040954113006592, 'learning_rate': 1.623013101807025e-05} -{'loss': 0.3545, 'grad_norm': 6.040954113006592, 'learning_rate': 1.623013101807025e-05, 'epoch': 0.32} - -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.4391, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.5484, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1981, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011589550413191319, 'train/lm_loss': 8.912484627217055e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.3454377055168152, 'train/uncertainty_loss': 0.019809661805629732, 'train/video_loss': 0.37455734610557556, 'train/total_loss': 0.37464648485183716} -tensor(-0.0019, device='cuda:1', grad_fn=) tensor(-0.0019, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4081481456756592, 'train/info_loss': 0.21005874872207642, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011364904930815101, 'train/video_loss': 0.20994509756565094, 'train/total_loss': 0.6180932521820068} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.5455, 'grad_norm': 13.616247177124023, 'learning_rate': 1.6221783197772633e-05}[Rank 3] Trainer log: {'loss': 0.5455, 'grad_norm': 13.616247177124023, 'learning_rate': 1.6221783197772633e-05} -[Rank 2] Trainer log: {'loss': 0.5455, 'grad_norm': 13.616247177124023, 'learning_rate': 1.6221783197772633e-05} - -[Rank 1] Trainer log: {'loss': 0.5455, 'grad_norm': 13.616247177124023, 'learning_rate': 1.6221783197772633e-05} -{'loss': 0.5455, 'grad_norm': 13.616247177124023, 'learning_rate': 1.6221783197772633e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0392, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011184865608811379, 'train/lm_loss': 0.0001025175559334457, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.24740096926689148, 'train/uncertainty_loss': 0.003917390108108521, 'train/video_loss': 0.2603054642677307, 'train/total_loss': 0.2604079842567444} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4057766437530518, 'train/info_loss': 0.10351146757602692, 'train/ref_loss': None, 'train/uncertainty_loss': -9.115069988183678e-05, 'train/video_loss': 0.10342031717300415, 'train/total_loss': 0.5091969966888428} -tensor(0.2305, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4016, 'grad_norm': 7.431344032287598, 'learning_rate': 1.6213428297300226e-05}[Rank 1] Trainer log: {'loss': 0.4016, 'grad_norm': 7.431344032287598, 'learning_rate': 1.6213428297300226e-05}[Rank 3] Trainer log: {'loss': 0.4016, 'grad_norm': 7.431344032287598, 'learning_rate': 1.6213428297300226e-05} - - -[Rank 0] Trainer log: {'loss': 0.4016, 'grad_norm': 7.431344032287598, 'learning_rate': 1.6213428297300226e-05} -{'loss': 0.4016, 'grad_norm': 7.431344032287598, 'learning_rate': 1.6213428297300226e-05, 'epoch': 0.32} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.43726568222045903, 'train/info_loss': 0.1530928611755371, 'train/ref_loss': None, 'train/uncertainty_loss': -9.945111814886333e-05, 'train/video_loss': 0.15299341082572937, 'train/total_loss': 0.5902590751647949} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08018993139266968, 'train/info_loss': 0.16648781299591064, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010217452654615045, 'train/video_loss': 0.16638563573360443, 'train/total_loss': 0.24657556414604187} -tensor(0.0697, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3752, 'grad_norm': 3.47300386428833, 'learning_rate': 1.620506632616061e-05}[Rank 3] Trainer log: {'loss': 0.3752, 'grad_norm': 3.47300386428833, 'learning_rate': 1.620506632616061e-05}[Rank 0] Trainer log: {'loss': 0.3752, 'grad_norm': 3.47300386428833, 'learning_rate': 1.620506632616061e-05} - -[Rank 2] Trainer log: {'loss': 0.3752, 'grad_norm': 3.47300386428833, 'learning_rate': 1.620506632616061e-05} - -{'loss': 0.3752, 'grad_norm': 3.47300386428833, 'learning_rate': 1.620506632616061e-05, 'epoch': 0.32} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0910, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0899, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010216088965535163, 'train/lm_loss': 0.00012291468447074295, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.18336841464042664, 'train/uncertainty_loss': -7.061558426357806e-05, 'train/video_loss': 0.19151143729686737, 'train/total_loss': 0.19163435697555542} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3049536466598511, 'train/info_loss': 0.1514400988817215, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001007717801257968, 'train/video_loss': 0.1513393223285675, 'train/total_loss': 0.456292986869812} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3962, 'grad_norm': 2.747042179107666, 'learning_rate': 1.6196697293869416e-05} -[Rank 1] Trainer log: {'loss': 0.3962, 'grad_norm': 2.747042179107666, 'learning_rate': 1.6196697293869416e-05}[Rank 3] Trainer log: {'loss': 0.3962, 'grad_norm': 2.747042179107666, 'learning_rate': 1.6196697293869416e-05}[Rank 0] Trainer log: {'loss': 0.3962, 'grad_norm': 2.747042179107666, 'learning_rate': 1.6196697293869416e-05} - - -{'loss': 0.3962, 'grad_norm': 2.747042179107666, 'learning_rate': 1.6196697293869416e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29747464656829836, 'train/info_loss': 0.2720152735710144, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010960937943309546, 'train/video_loss': 0.27190566062927246, 'train/total_loss': 0.5693802833557129} -tensor(0.1580, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +{'train/tv_loss': 0.00020076960790902377, 'train/lm_loss': 2.4650961859151724e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.20887520909309387, 'train/uncertainty_loss': -7.02370423823595e-05, 'train/video_loss': 0.21042905747890472, 'train/total_loss': 0.21045370399951935} tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1489, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001094124373048544, 'train/lm_loss': 8.936316007748247e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.3127734065055847, 'train/uncertainty_loss': 0.014892511069774628, 'train/video_loss': 0.3364560604095459, 'train/total_loss': 0.3365454375743866} -[Rank 2] Trainer log: {'loss': 0.3634, 'grad_norm': 5.327772617340088, 'learning_rate': 1.6188321209950326e-05}[Rank 1] Trainer log: {'loss': 0.3634, 'grad_norm': 5.327772617340088, 'learning_rate': 1.6188321209950326e-05}[Rank 0] Trainer log: {'loss': 0.3634, 'grad_norm': 5.327772617340088, 'learning_rate': 1.6188321209950326e-05} - -[Rank 3] Trainer log: {'loss': 0.3634, 'grad_norm': 5.327772617340088, 'learning_rate': 1.6188321209950326e-05} - -{'loss': 0.3634, 'grad_norm': 5.327772617340088, 'learning_rate': 1.6188321209950326e-05, 'epoch': 0.32} -tensor(-0.0017, device='cuda:3', grad_fn=) tensor(-0.0017, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1851, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1963, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009804653003811837, 'train/lm_loss': 7.730420329608024e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.3439442813396454, 'train/uncertainty_loss': 0.01963055580854416, 'train/video_loss': 0.37145373225212097, 'train/total_loss': 0.37153103947639465} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15962464809417726, 'train/info_loss': 0.28677600622177124, 'train/ref_loss': None, 'train/uncertainty_loss': -9.765485883690418e-05, 'train/video_loss': 0.28667834401130676, 'train/total_loss': 0.44630300998687744} -tensor(0.2423, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3776, 'grad_norm': 11.040372848510742, 'learning_rate': 1.6179938083935027e-05}[Rank 0] Trainer log: {'loss': 0.3776, 'grad_norm': 11.040372848510742, 'learning_rate': 1.6179938083935027e-05}[Rank 3] Trainer log: {'loss': 0.3776, 'grad_norm': 11.040372848510742, 'learning_rate': 1.6179938083935027e-05} +tensor(0.3715, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 2] Trainer log: {'loss': 0.3726, 'grad_norm': 13.78152084350586, 'learning_rate': 5.19836284493341e-06}[Rank 3] Trainer log: {'loss': 0.3726, 'grad_norm': 13.78152084350586, 'learning_rate': 5.19836284493341e-06}[Rank 1] Trainer log: {'loss': 0.3726, 'grad_norm': 13.78152084350586, 'learning_rate': 5.19836284493341e-06} -[Rank 1] Trainer log: {'loss': 0.3776, 'grad_norm': 11.040372848510742, 'learning_rate': 1.6179938083935027e-05} -{'loss': 0.3776, 'grad_norm': 11.040372848510742, 'learning_rate': 1.6179938083935027e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2142038345336914, 'train/info_loss': 0.20404334366321564, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010988774010911584, 'train/video_loss': 0.2039334625005722, 'train/total_loss': 0.4181373119354248} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) +[Rank 0] Trainer log: {'loss': 0.3726, 'grad_norm': 13.78152084350586, 'learning_rate': 5.19836284493341e-06} +{'loss': 0.3726, 'grad_norm': 13.78152084350586, 'learning_rate': 5.19836284493341e-06, 'epoch': 0.68} +tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2574564695358276, 'train/info_loss': 0.19637063145637512, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010931746801361442, 'train/video_loss': 0.19626131653785706, 'train/total_loss': 0.45371779799461365} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0034, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3939, 'grad_norm': 3.4271085262298584, 'learning_rate': 1.617154792536323e-05}[Rank 1] Trainer log: {'loss': 0.3939, 'grad_norm': 3.4271085262298584, 'learning_rate': 1.617154792536323e-05} - -[Rank 2] Trainer log: {'loss': 0.3939, 'grad_norm': 3.4271085262298584, 'learning_rate': 1.617154792536323e-05} -[Rank 3] Trainer log: {'loss': 0.3939, 'grad_norm': 3.4271085262298584, 'learning_rate': 1.617154792536323e-05} -{'loss': 0.3939, 'grad_norm': 3.4271085262298584, 'learning_rate': 1.617154792536323e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.3160058736801148, 'train/info_loss': 0.15621928870677948, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010966254631057383, 'train/video_loss': 0.15610963106155396, 'train/total_loss': 0.47211551666259766} tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0751, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009054861962795258, 'train/lm_loss': 0.00022629804443567993, 'train/info_loss': 4.58338727185037e-05, 'train/ref_loss': 0.26662129163742065, 'train/uncertainty_loss': 0.007505714148283005, 'train/video_loss': 0.28141674399375916, 'train/total_loss': 0.2816430330276489} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3642987489700318, 'train/info_loss': 0.23968617618083954, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001345232012681663, 'train/video_loss': 0.23955164849758148, 'train/total_loss': 0.6038504242897034} -tensor(0.2239, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0352, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3923, 'grad_norm': 2.6218597888946533, 'learning_rate': 1.6163150743782648e-05} -[Rank 3] Trainer log: {'loss': 0.3923, 'grad_norm': 2.6218597888946533, 'learning_rate': 1.6163150743782648e-05}[Rank 2] Trainer log: {'loss': 0.3923, 'grad_norm': 2.6218597888946533, 'learning_rate': 1.6163150743782648e-05} - -[Rank 0] Trainer log: {'loss': 0.3923, 'grad_norm': 2.6218597888946533, 'learning_rate': 1.6163150743782648e-05} -{'loss': 0.3923, 'grad_norm': 2.6218597888946533, 'learning_rate': 1.6163150743782648e-05, 'epoch': 0.32} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1588, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2162, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008653523400425911, 'train/lm_loss': 0.0003634248394519091, 'train/info_loss': 5.840913945576176e-05, 'train/ref_loss': 0.35375964641571045, 'train/uncertainty_loss': 0.021619726717472077, 'train/video_loss': 0.3823606073856354, 'train/total_loss': 0.3827240467071533} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2675053358078003, 'train/info_loss': 0.23996949195861816, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001417772611603141, 'train/video_loss': 0.2398277074098587, 'train/total_loss': 0.5073330402374268} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4476, 'grad_norm': 3.615865707397461, 'learning_rate': 1.615474654874898e-05}[Rank 1] Trainer log: {'loss': 0.4476, 'grad_norm': 3.615865707397461, 'learning_rate': 1.615474654874898e-05}[Rank 0] Trainer log: {'loss': 0.4476, 'grad_norm': 3.615865707397461, 'learning_rate': 1.615474654874898e-05} - -[Rank 2] Trainer log: {'loss': 0.4476, 'grad_norm': 3.615865707397461, 'learning_rate': 1.615474654874898e-05} - -{'loss': 0.4476, 'grad_norm': 3.615865707397461, 'learning_rate': 1.615474654874898e-05, 'epoch': 0.32} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3125016212463379, 'train/info_loss': 0.13807782530784607, 'train/ref_loss': None, 'train/uncertainty_loss': -9.473854443058372e-05, 'train/video_loss': 0.13798308372497559, 'train/total_loss': 0.4504847228527069} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3859776020050049, 'train/info_loss': 0.09924226254224777, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011515605729073286, 'train/video_loss': 0.09912710636854172, 'train/total_loss': 0.4851047098636627} +tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0203, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0091, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3896, 'grad_norm': 1.974806308746338, 'learning_rate': 1.614633534982591e-05}[Rank 3] Trainer log: {'loss': 0.3896, 'grad_norm': 1.974806308746338, 'learning_rate': 1.614633534982591e-05} -[Rank 2] Trainer log: {'loss': 0.3896, 'grad_norm': 1.974806308746338, 'learning_rate': 1.614633534982591e-05} -[Rank 1] Trainer log: {'loss': 0.3896, 'grad_norm': 1.974806308746338, 'learning_rate': 1.614633534982591e-05} - -{'loss': 0.3896, 'grad_norm': 1.974806308746338, 'learning_rate': 1.614633534982591e-05, 'epoch': 0.32} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2297077178955078, 'train/info_loss': 0.15873320400714874, 'train/ref_loss': None, 'train/uncertainty_loss': -9.90675063803792e-05, 'train/video_loss': 0.15863414108753204, 'train/total_loss': 0.38834184408187866} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2258, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24933876991271975, 'train/info_loss': 0.18499110639095306, 'train/ref_loss': None, 'train/uncertainty_loss': -8.247103542089463e-05, 'train/video_loss': 0.18490862846374512, 'train/total_loss': 0.4342474043369293} -tensor(0.0094, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3185, 'grad_norm': 5.431887149810791, 'learning_rate': 1.6137917156585096e-05}[Rank 0] Trainer log: {'loss': 0.3185, 'grad_norm': 5.431887149810791, 'learning_rate': 1.6137917156585096e-05}[Rank 1] Trainer log: {'loss': 0.3185, 'grad_norm': 5.431887149810791, 'learning_rate': 1.6137917156585096e-05} - -[Rank 3] Trainer log: {'loss': 0.3185, 'grad_norm': 5.431887149810791, 'learning_rate': 1.6137917156585096e-05} +{'train/tv_loss': None, 'train/lm_loss': 0.017812368273735047, 'train/info_loss': 0.12798281013965607, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010375801939517261, 'train/video_loss': 0.1278790533542633, 'train/total_loss': 0.1456914246082306} +tensor(0.0281, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 0] Trainer log: {'loss': 0.3523, 'grad_norm': 5.630080699920654, 'learning_rate': 5.189008231527891e-06}[Rank 3] Trainer log: {'loss': 0.3523, 'grad_norm': 5.630080699920654, 'learning_rate': 5.189008231527891e-06} +[Rank 1] Trainer log: {'loss': 0.3523, 'grad_norm': 5.630080699920654, 'learning_rate': 5.189008231527891e-06} +[Rank 2] Trainer log: {'loss': 0.3523, 'grad_norm': 5.630080699920654, 'learning_rate': 5.189008231527891e-06} -{'loss': 0.3185, 'grad_norm': 5.431887149810791, 'learning_rate': 1.6137917156585096e-05, 'epoch': 0.33} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26536097526550295, 'train/info_loss': 0.15133464336395264, 'train/ref_loss': None, 'train/uncertainty_loss': -8.574576349928975e-05, 'train/video_loss': 0.15124890208244324, 'train/total_loss': 0.41660988330841064} -tensor(0.1482, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) +{'loss': 0.3523, 'grad_norm': 5.630080699920654, 'learning_rate': 5.189008231527891e-06, 'epoch': 0.68} tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29530498981475833, 'train/info_loss': 0.19007977843284607, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011581596918404103, 'train/video_loss': 0.18996396660804749, 'train/total_loss': 0.4852689504623413} -[Rank 1] Trainer log: {'loss': 0.4773, 'grad_norm': 4.01504373550415, 'learning_rate': 1.6129491978606147e-05}[Rank 2] Trainer log: {'loss': 0.4773, 'grad_norm': 4.01504373550415, 'learning_rate': 1.6129491978606147e-05}[Rank 3] Trainer log: {'loss': 0.4773, 'grad_norm': 4.01504373550415, 'learning_rate': 1.6129491978606147e-05} - - -[Rank 0] Trainer log: {'loss': 0.4773, 'grad_norm': 4.01504373550415, 'learning_rate': 1.6129491978606147e-05} -{'loss': 0.4773, 'grad_norm': 4.01504373550415, 'learning_rate': 1.6129491978606147e-05, 'epoch': 0.33} tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19349884986877441, 'train/info_loss': 0.12720108032226562, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013151681050658227, 'train/video_loss': 0.12706956267356873, 'train/total_loss': 0.32056841254234314} -tensor(0.1631, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) +tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) +{'train/tv_loss': 0.00021480978466570377, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.5139350580284372e-05, 'train/ref_loss': 0.06869110465049744, 'train/uncertainty_loss': -6.853470695205033e-05, 'train/video_loss': 0.07035618275403976, 'train/total_loss': 0.07037794589996338} tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0253, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1946, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001117449626326561, 'train/lm_loss': 9.61549347266555e-05, 'train/info_loss': 3.892031963914633e-05, 'train/ref_loss': 0.3318180739879608, 'train/uncertainty_loss': 0.01945956200361252, 'train/video_loss': 0.360256165266037, 'train/total_loss': 0.3603523075580597} -[Rank 2] Trainer log: {'loss': 0.3146, 'grad_norm': 7.179553031921387, 'learning_rate': 1.612105982547663e-05}[Rank 3] Trainer log: {'loss': 0.3146, 'grad_norm': 7.179553031921387, 'learning_rate': 1.612105982547663e-05}[Rank 0] Trainer log: {'loss': 0.3146, 'grad_norm': 7.179553031921387, 'learning_rate': 1.612105982547663e-05} - -[Rank 1] Trainer log: {'loss': 0.3146, 'grad_norm': 7.179553031921387, 'learning_rate': 1.612105982547663e-05} - -{'loss': 0.3146, 'grad_norm': 7.179553031921387, 'learning_rate': 1.612105982547663e-05, 'epoch': 0.33} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0244, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007943136617541314, 'train/lm_loss': 8.77187994774431e-05, 'train/info_loss': 3.743031629710458e-05, 'train/ref_loss': 0.2368764728307724, 'train/uncertainty_loss': 0.0024443870410323144, 'train/video_loss': 0.24571280181407928, 'train/total_loss': 0.24580052495002747} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26316485404968265, 'train/info_loss': 0.14665135741233826, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010544222313910723, 'train/video_loss': 0.1465459167957306, 'train/total_loss': 0.40971076488494873} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.2931710720062256, 'train/info_loss': 0.22656624019145966, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010749018983915449, 'train/video_loss': 0.22645874321460724, 'train/total_loss': 0.5196298360824585} tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3464, 'grad_norm': 9.548771858215332, 'learning_rate': 1.6112620706792044e-05}[Rank 1] Trainer log: {'loss': 0.3464, 'grad_norm': 9.548771858215332, 'learning_rate': 1.6112620706792044e-05}[Rank 2] Trainer log: {'loss': 0.3464, 'grad_norm': 9.548771858215332, 'learning_rate': 1.6112620706792044e-05} - - -[Rank 3] Trainer log: {'loss': 0.3464, 'grad_norm': 9.548771858215332, 'learning_rate': 1.6112620706792044e-05} -{'loss': 0.3464, 'grad_norm': 9.548771858215332, 'learning_rate': 1.6112620706792044e-05, 'epoch': 0.33} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2143861770629883, 'train/info_loss': 0.17061617970466614, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011603714665398002, 'train/video_loss': 0.17050014436244965, 'train/total_loss': 0.38488632440567017} -tensor(0.3187, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0112, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2873803377151489, 'train/info_loss': 0.21251225471496582, 'train/ref_loss': None, 'train/uncertainty_loss': -9.460778092034161e-05, 'train/video_loss': 0.21241764724254608, 'train/total_loss': 0.4997979998588562} -tensor(0.3372, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4201, 'grad_norm': 2.449599504470825, 'learning_rate': 1.6104174632155813e-05} -[Rank 3] Trainer log: {'loss': 0.4201, 'grad_norm': 2.449599504470825, 'learning_rate': 1.6104174632155813e-05} -[Rank 2] Trainer log: {'loss': 0.4201, 'grad_norm': 2.449599504470825, 'learning_rate': 1.6104174632155813e-05} -[Rank 0] Trainer log: {'loss': 0.4201, 'grad_norm': 2.449599504470825, 'learning_rate': 1.6104174632155813e-05} -{'loss': 0.4201, 'grad_norm': 2.449599504470825, 'learning_rate': 1.6104174632155813e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32791240215301515, 'train/info_loss': 0.1386835277080536, 'train/ref_loss': None, 'train/uncertainty_loss': -9.856340475380421e-05, 'train/video_loss': 0.13858497142791748, 'train/total_loss': 0.46649739146232605} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3261512517929077, 'train/info_loss': 0.22392477095127106, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010742142330855131, 'train/video_loss': 0.2238173484802246, 'train/total_loss': 0.5499686002731323} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.5072, 'grad_norm': 2.975259304046631, 'learning_rate': 1.609572161117928e-05}[Rank 0] Trainer log: {'loss': 0.5072, 'grad_norm': 2.975259304046631, 'learning_rate': 1.609572161117928e-05}[Rank 2] Trainer log: {'loss': 0.5072, 'grad_norm': 2.975259304046631, 'learning_rate': 1.609572161117928e-05} - -[Rank 1] Trainer log: {'loss': 0.5072, 'grad_norm': 2.975259304046631, 'learning_rate': 1.609572161117928e-05} - -{'loss': 0.5072, 'grad_norm': 2.975259304046631, 'learning_rate': 1.609572161117928e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001043757237493992, 'train/lm_loss': 0.00020110029727220537, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.1623699963092804, 'train/uncertainty_loss': -7.649477920494974e-05, 'train/video_loss': 0.17068469524383545, 'train/total_loss': 0.17088580131530762} -tensor(0.0950, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0419, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007512886077165604, 'train/lm_loss': 6.984444917179644e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.11861558258533478, 'train/uncertainty_loss': -6.97568932082504e-05, 'train/video_loss': 0.12459076195955276, 'train/total_loss': 0.12466060370206833} -[Rank 3] Trainer log: {'loss': 0.2466, 'grad_norm': 3.548736572265625, 'learning_rate': 1.6087261653481688e-05}[Rank 1] Trainer log: {'loss': 0.2466, 'grad_norm': 3.548736572265625, 'learning_rate': 1.6087261653481688e-05}[Rank 2] Trainer log: {'loss': 0.2466, 'grad_norm': 3.548736572265625, 'learning_rate': 1.6087261653481688e-05} - +tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) +tensor(0.0354, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) +[Rank 1] Trainer log: {'loss': 0.2831, 'grad_norm': 3.7484803199768066, 'learning_rate': 5.179659092865164e-06}[Rank 3] Trainer log: {'loss': 0.2831, 'grad_norm': 3.7484803199768066, 'learning_rate': 5.179659092865164e-06} +[Rank 2] Trainer log: {'loss': 0.2831, 'grad_norm': 3.7484803199768066, 'learning_rate': 5.179659092865164e-06} -[Rank 0] Trainer log: {'loss': 0.2466, 'grad_norm': 3.548736572265625, 'learning_rate': 1.6087261653481688e-05} -{'loss': 0.2466, 'grad_norm': 3.548736572265625, 'learning_rate': 1.6087261653481688e-05, 'epoch': 0.33} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) +[Rank 0] Trainer log: {'loss': 0.2831, 'grad_norm': 3.7484803199768066, 'learning_rate': 5.179659092865164e-06} +{'loss': 0.2831, 'grad_norm': 3.7484803199768066, 'learning_rate': 5.179659092865164e-06, 'epoch': 0.68} +tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.30767171382904057, 'train/info_loss': 0.16729487478733063, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012666770489886404, 'train/video_loss': 0.16716820001602173, 'train/total_loss': 0.4748399257659912} tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0621, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000972279440611601, 'train/lm_loss': 0.00011283551575616003, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.2613287568092346, 'train/uncertainty_loss': 0.006205539405345917, 'train/video_loss': 0.27535173296928406, 'train/total_loss': 0.27546456456184387} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1559, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010096441954374313, 'train/lm_loss': 0.00024387266021221878, 'train/info_loss': 4.917141268379055e-05, 'train/ref_loss': 0.31677547097206116, 'train/uncertainty_loss': 0.015586629509925842, 'train/video_loss': 0.3404884338378906, 'train/total_loss': 0.34073230624198914} -tensor(0.1332, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0347, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3515, 'grad_norm': 11.603082656860352, 'learning_rate': 1.6078794768690182e-05}[Rank 3] Trainer log: {'loss': 0.3515, 'grad_norm': 11.603082656860352, 'learning_rate': 1.6078794768690182e-05}[Rank 1] Trainer log: {'loss': 0.3515, 'grad_norm': 11.603082656860352, 'learning_rate': 1.6078794768690182e-05} - -[Rank 2] Trainer log: {'loss': 0.3515, 'grad_norm': 11.603082656860352, 'learning_rate': 1.6078794768690182e-05} - -{'loss': 0.3515, 'grad_norm': 11.603082656860352, 'learning_rate': 1.6078794768690182e-05, 'epoch': 0.33} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1254, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1364, device='cuda:3', grad_fn=) tensor(0.1372, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009311805479228497, 'train/lm_loss': 0.00012255727779120207, 'train/info_loss': 4.046991671202704e-05, 'train/ref_loss': 0.3051985204219818, 'train/uncertainty_loss': 0.013724373281002046, 'train/video_loss': 0.3264128267765045, 'train/total_loss': 0.3265353739261627} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0340, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010068912990391255, 'train/lm_loss': 7.737569976598025e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.1505722552537918, 'train/uncertainty_loss': -7.40673509426415e-05, 'train/video_loss': 0.15858818590641022, 'train/total_loss': 0.15866556763648987} -[Rank 3] Trainer log: {'loss': 0.4159, 'grad_norm': 2.9683077335357666, 'learning_rate': 1.6070320966439784e-05} -[Rank 2] Trainer log: {'loss': 0.4159, 'grad_norm': 2.9683077335357666, 'learning_rate': 1.6070320966439784e-05} -[Rank 0] Trainer log: {'loss': 0.4159, 'grad_norm': 2.9683077335357666, 'learning_rate': 1.6070320966439784e-05}[Rank 1] Trainer log: {'loss': 0.4159, 'grad_norm': 2.9683077335357666, 'learning_rate': 1.6070320966439784e-05} - -{'loss': 0.4159, 'grad_norm': 2.9683077335357666, 'learning_rate': 1.6070320966439784e-05, 'epoch': 0.33} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12675063610076906, 'train/info_loss': 0.29527175426483154, 'train/ref_loss': None, 'train/uncertainty_loss': -9.72831272520125e-05, 'train/video_loss': 0.2951744794845581, 'train/total_loss': 0.4219251275062561} tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.390305233001709, 'train/info_loss': 0.15447980165481567, 'train/ref_loss': None, 'train/uncertainty_loss': -9.896683041006328e-05, 'train/video_loss': 0.15438082814216614, 'train/total_loss': 0.5446860790252686} -tensor(0.0444, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3757, 'grad_norm': 4.209166526794434, 'learning_rate': 1.606184025637339e-05}[Rank 2] Trainer log: {'loss': 0.3757, 'grad_norm': 4.209166526794434, 'learning_rate': 1.606184025637339e-05} -[Rank 3] Trainer log: {'loss': 0.3757, 'grad_norm': 4.209166526794434, 'learning_rate': 1.606184025637339e-05} - -[Rank 0] Trainer log: {'loss': 0.3757, 'grad_norm': 4.209166526794434, 'learning_rate': 1.606184025637339e-05} -{'loss': 0.3757, 'grad_norm': 4.209166526794434, 'learning_rate': 1.606184025637339e-05, 'epoch': 0.33} tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3225510358810425, 'train/info_loss': 0.1433652937412262, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010891730198636651, 'train/video_loss': 0.14325638115406036, 'train/total_loss': 0.4658074378967285} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2786545753479004, 'train/info_loss': 0.193758025765419, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001163292326964438, 'train/video_loss': 0.19364169239997864, 'train/total_loss': 0.47229626774787903} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4673, 'grad_norm': 3.126945734024048, 'learning_rate': 1.605335264814175e-05}[Rank 1] Trainer log: {'loss': 0.4673, 'grad_norm': 3.126945734024048, 'learning_rate': 1.605335264814175e-05} - -[Rank 0] Trainer log: {'loss': 0.4673, 'grad_norm': 3.126945734024048, 'learning_rate': 1.605335264814175e-05} -[Rank 2] Trainer log: {'loss': 0.4673, 'grad_norm': 3.126945734024048, 'learning_rate': 1.605335264814175e-05} -{'loss': 0.4673, 'grad_norm': 3.126945734024048, 'learning_rate': 1.605335264814175e-05, 'epoch': 0.33} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21524641513824463, 'train/info_loss': 0.23959839344024658, 'train/ref_loss': None, 'train/uncertainty_loss': -9.676982881501317e-05, 'train/video_loss': 0.23950162529945374, 'train/total_loss': 0.4547480344772339} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3200, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.3486, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0235, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009706035256385804, 'train/lm_loss': 0.00010904676746577026, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.239169642329216, 'train/uncertainty_loss': 0.002354037389159203, 'train/video_loss': 0.24932868778705597, 'train/total_loss': 0.24943773448467255} -[Rank 3] Trainer log: {'loss': 0.376, 'grad_norm': 10.731908798217773, 'learning_rate': 1.6044858151403478e-05}[Rank 2] Trainer log: {'loss': 0.376, 'grad_norm': 10.731908798217773, 'learning_rate': 1.6044858151403478e-05}[Rank 1] Trainer log: {'loss': 0.376, 'grad_norm': 10.731908798217773, 'learning_rate': 1.6044858151403478e-05} - - -[Rank 0] Trainer log: {'loss': 0.376, 'grad_norm': 10.731908798217773, 'learning_rate': 1.6044858151403478e-05} -{'loss': 0.376, 'grad_norm': 10.731908798217773, 'learning_rate': 1.6044858151403478e-05, 'epoch': 0.33} +{'train/tv_loss': None, 'train/lm_loss': 0.259895396232605, 'train/info_loss': 0.19613586366176605, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011139851994812489, 'train/video_loss': 0.19602446258068085, 'train/total_loss': 0.45591986179351807} tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1609, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2081, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012530672363936903, 'train/lm_loss': 7.794768316671252e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.3480134904384613, 'train/uncertainty_loss': 0.020806159079074862, 'train/video_loss': 0.37887799739837646, 'train/total_loss': 0.3789559304714203} tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08798917531967164, 'train/info_loss': 0.10712561011314392, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010583301773294807, 'train/video_loss': 0.10701977461576462, 'train/total_loss': 0.19500894844532013} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.8116, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4345, 'grad_norm': 17.275712966918945, 'learning_rate': 1.603635677582501e-05}[Rank 2] Trainer log: {'loss': 0.4345, 'grad_norm': 17.275712966918945, 'learning_rate': 1.603635677582501e-05}[Rank 1] Trainer log: {'loss': 0.4345, 'grad_norm': 17.275712966918945, 'learning_rate': 1.603635677582501e-05} - - -[Rank 0] Trainer log: {'loss': 0.4345, 'grad_norm': 17.275712966918945, 'learning_rate': 1.603635677582501e-05} -{'loss': 0.4345, 'grad_norm': 17.275712966918945, 'learning_rate': 1.603635677582501e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3214067220687866, 'train/info_loss': 0.1887696087360382, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011720227776095271, 'train/video_loss': 0.1886524111032486, 'train/total_loss': 0.510059118270874} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0326, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0019513495266437531, 'train/lm_loss': 0.00014921819092705846, 'train/info_loss': 4.619146420736797e-05, 'train/ref_loss': 0.10216908156871796, 'train/uncertainty_loss': -8.677395526319743e-05, 'train/video_loss': 0.11773929744958878, 'train/total_loss': 0.11788851767778397} -tensor(0.0078, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3087, 'grad_norm': 6.6473164558410645, 'learning_rate': 1.6027848531080622e-05}[Rank 1] Trainer log: {'loss': 0.3087, 'grad_norm': 6.6473164558410645, 'learning_rate': 1.6027848531080622e-05}[Rank 2] Trainer log: {'loss': 0.3087, 'grad_norm': 6.6473164558410645, 'learning_rate': 1.6027848531080622e-05} - - -[Rank 0] Trainer log: {'loss': 0.3087, 'grad_norm': 6.6473164558410645, 'learning_rate': 1.6027848531080622e-05} -{'loss': 0.3087, 'grad_norm': 6.6473164558410645, 'learning_rate': 1.6027848531080622e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34944019317626956, 'train/info_loss': 0.2008563131093979, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001189467846415937, 'train/video_loss': 0.2007373720407486, 'train/total_loss': 0.5501775741577148} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3352045774459839, 'train/info_loss': 0.15491542220115662, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001010258449241519, 'train/video_loss': 0.15481439232826233, 'train/total_loss': 0.49001896381378174} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1163, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4666, 'grad_norm': 4.861307621002197, 'learning_rate': 1.6019333426852406e-05}[Rank 2] Trainer log: {'loss': 0.4666, 'grad_norm': 4.861307621002197, 'learning_rate': 1.6019333426852406e-05}[Rank 0] Trainer log: {'loss': 0.4666, 'grad_norm': 4.861307621002197, 'learning_rate': 1.6019333426852406e-05} +tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) +[Rank 3] Trainer log: {'loss': 0.3622, 'grad_norm': 2.7072715759277344, 'learning_rate': 5.170315439584212e-06}[Rank 2] Trainer log: {'loss': 0.3622, 'grad_norm': 2.7072715759277344, 'learning_rate': 5.170315439584212e-06}[Rank 1] Trainer log: {'loss': 0.3622, 'grad_norm': 2.7072715759277344, 'learning_rate': 5.170315439584212e-06} -[Rank 1] Trainer log: {'loss': 0.4666, 'grad_norm': 4.861307621002197, 'learning_rate': 1.6019333426852406e-05} -{'loss': 0.4666, 'grad_norm': 4.861307621002197, 'learning_rate': 1.6019333426852406e-05, 'epoch': 0.33} +[Rank 0] Trainer log: {'loss': 0.3622, 'grad_norm': 2.7072715759277344, 'learning_rate': 5.170315439584212e-06} +{'loss': 0.3622, 'grad_norm': 2.7072715759277344, 'learning_rate': 5.170315439584212e-06, 'epoch': 0.68} tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.3220, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008322461508214474, 'train/lm_loss': 0.00010244606528431177, 'train/info_loss': 3.80263190891128e-05, 'train/ref_loss': 0.18396742641925812, 'train/uncertainty_loss': -7.110607693903149e-05, 'train/video_loss': 0.19059231877326965, 'train/total_loss': 0.19069476425647736} -tensor(0.1918, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4124, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) +tensor(0.1476, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009207648225128651, 'train/lm_loss': 7.720887078903616e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.12701012194156647, 'train/uncertainty_loss': -7.307035266421736e-05, 'train/video_loss': 0.1343388706445694, 'train/total_loss': 0.13441607356071472} -[Rank 1] Trainer log: {'loss': 0.3188, 'grad_norm': 6.997394561767578, 'learning_rate': 1.6010811472830253e-05}[Rank 0] Trainer log: {'loss': 0.3188, 'grad_norm': 6.997394561767578, 'learning_rate': 1.6010811472830253e-05}[Rank 2] Trainer log: {'loss': 0.3188, 'grad_norm': 6.997394561767578, 'learning_rate': 1.6010811472830253e-05} -[Rank 3] Trainer log: {'loss': 0.3188, 'grad_norm': 6.997394561767578, 'learning_rate': 1.6010811472830253e-05} - - -{'loss': 0.3188, 'grad_norm': 6.997394561767578, 'learning_rate': 1.6010811472830253e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16674011945724487, 'train/info_loss': 0.18495704233646393, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010000240290537478, 'train/video_loss': 0.18485704064369202, 'train/total_loss': 0.3515971601009369} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12168151140213013, 'train/info_loss': 0.159895658493042, 'train/ref_loss': None, 'train/uncertainty_loss': -9.093803819268942e-05, 'train/video_loss': 0.15980471670627594, 'train/total_loss': 0.2814862132072449} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4298, 'grad_norm': 3.1718268394470215, 'learning_rate': 1.6002282678711853e-05}[Rank 0] Trainer log: {'loss': 0.4298, 'grad_norm': 3.1718268394470215, 'learning_rate': 1.6002282678711853e-05} -[Rank 3] Trainer log: {'loss': 0.4298, 'grad_norm': 3.1718268394470215, 'learning_rate': 1.6002282678711853e-05} -[Rank 1] Trainer log: {'loss': 0.4298, 'grad_norm': 3.1718268394470215, 'learning_rate': 1.6002282678711853e-05} - -{'loss': 0.4298, 'grad_norm': 3.1718268394470215, 'learning_rate': 1.6002282678711853e-05, 'epoch': 0.33} -tensor(0.0414, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1972, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011655806563794612, 'train/lm_loss': 7.849583053030074e-05, 'train/info_loss': 3.3556287235114723e-05, 'train/ref_loss': 0.3411838114261627, 'train/uncertainty_loss': 0.019720697402954103, 'train/video_loss': 0.3702627122402191, 'train/total_loss': 0.3703412115573883} -tensor(0.1344, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1504, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0225, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0966, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009461711160838604, 'train/lm_loss': 8.914867648854852e-05, 'train/info_loss': 3.743031629710458e-05, 'train/ref_loss': 0.2743179202079773, 'train/uncertainty_loss': 0.009662693738937378, 'train/video_loss': 0.2915874123573303, 'train/total_loss': 0.2916765511035919} -[Rank 3] Trainer log: {'loss': 0.3426, 'grad_norm': 2.2070376873016357, 'learning_rate': 1.5993747054202683e-05}[Rank 2] Trainer log: {'loss': 0.3426, 'grad_norm': 2.2070376873016357, 'learning_rate': 1.5993747054202683e-05}[Rank 1] Trainer log: {'loss': 0.3426, 'grad_norm': 2.2070376873016357, 'learning_rate': 1.5993747054202683e-05} - - -[Rank 0] Trainer log: {'loss': 0.3426, 'grad_norm': 2.2070376873016357, 'learning_rate': 1.5993747054202683e-05} -{'loss': 0.3426, 'grad_norm': 2.2070376873016357, 'learning_rate': 1.5993747054202683e-05, 'epoch': 0.33} -tensor(-0.0017, device='cuda:2', grad_fn=) tensor(-0.0017, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27979838848114014, 'train/info_loss': 0.18480737507343292, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011954233050346375, 'train/video_loss': 0.18468783795833588, 'train/total_loss': 0.4644862413406372} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26253483295440677, 'train/info_loss': 0.2584110200405121, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014068001182749869, 'train/video_loss': 0.25827035307884216, 'train/total_loss': 0.5208051800727844} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5501, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4586, 'grad_norm': 9.699905395507812, 'learning_rate': 1.5985204609015986e-05}[Rank 3] Trainer log: {'loss': 0.4586, 'grad_norm': 9.699905395507812, 'learning_rate': 1.5985204609015986e-05}[Rank 1] Trainer log: {'loss': 0.4586, 'grad_norm': 9.699905395507812, 'learning_rate': 1.5985204609015986e-05} - -[Rank 2] Trainer log: {'loss': 0.4586, 'grad_norm': 9.699905395507812, 'learning_rate': 1.5985204609015986e-05} - -{'loss': 0.4586, 'grad_norm': 9.699905395507812, 'learning_rate': 1.5985204609015986e-05, 'epoch': 0.33} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1964, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010989285074174404, 'train/lm_loss': 0.00013089664280414582, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.34301817417144775, 'train/uncertainty_loss': 0.019642318785190585, 'train/video_loss': 0.37149208784103394, 'train/total_loss': 0.3716229796409607} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26793751716613773, 'train/info_loss': 0.2624358534812927, 'train/ref_loss': None, 'train/uncertainty_loss': -8.980766288004816e-05, 'train/video_loss': 0.2623460590839386, 'train/total_loss': 0.5302835702896118} -[Rank 2] Trainer log: {'loss': 0.4059, 'grad_norm': 6.187150955200195, 'learning_rate': 1.5976655352872773e-05}[Rank 3] Trainer log: {'loss': 0.4059, 'grad_norm': 6.187150955200195, 'learning_rate': 1.5976655352872773e-05}[Rank 1] Trainer log: {'loss': 0.4059, 'grad_norm': 6.187150955200195, 'learning_rate': 1.5976655352872773e-05} - - -[Rank 0] Trainer log: {'loss': 0.4059, 'grad_norm': 6.187150955200195, 'learning_rate': 1.5976655352872773e-05} -{'loss': 0.4059, 'grad_norm': 6.187150955200195, 'learning_rate': 1.5976655352872773e-05, 'epoch': 0.33} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.3532, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007913817651569844, 'train/lm_loss': 8.864821866154671e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.4423324465751648, 'train/uncertainty_loss': 0.0353191077709198, 'train/video_loss': 0.4840174913406372, 'train/total_loss': 0.4841061532497406} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) +{'train/tv_loss': 0.0002299083396792412, 'train/lm_loss': 0.00013740110443904996, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.2081691175699234, 'train/uncertainty_loss': -7.465039961971343e-05, 'train/video_loss': 0.20996198058128357, 'train/total_loss': 0.21009938418865204} +tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2522159814834595, 'train/info_loss': 0.23566104471683502, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012934593250975014, 'train/video_loss': 0.23553170263767242, 'train/total_loss': 0.4877476692199707} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3991, 'grad_norm': 9.678742408752441, 'learning_rate': 1.5968099295501804e-05}[Rank 3] Trainer log: {'loss': 0.3991, 'grad_norm': 9.678742408752441, 'learning_rate': 1.5968099295501804e-05}[Rank 0] Trainer log: {'loss': 0.3991, 'grad_norm': 9.678742408752441, 'learning_rate': 1.5968099295501804e-05} +{'train/tv_loss': None, 'train/lm_loss': 0.3807173013687134, 'train/info_loss': 0.20246171951293945, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012987093068659306, 'train/video_loss': 0.20233185589313507, 'train/total_loss': 0.5830491781234741} +tensor(0.2717, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +[Rank 2] Trainer log: {'loss': 0.379, 'grad_norm': 9.212971687316895, 'learning_rate': 5.160977282317798e-06}[Rank 3] Trainer log: {'loss': 0.379, 'grad_norm': 9.212971687316895, 'learning_rate': 5.160977282317798e-06}[Rank 0] Trainer log: {'loss': 0.379, 'grad_norm': 9.212971687316895, 'learning_rate': 5.160977282317798e-06} -[Rank 1] Trainer log: {'loss': 0.3991, 'grad_norm': 9.678742408752441, 'learning_rate': 1.5968099295501804e-05} +[Rank 1] Trainer log: {'loss': 0.379, 'grad_norm': 9.212971687316895, 'learning_rate': 5.160977282317798e-06} -{'loss': 0.3991, 'grad_norm': 9.678742408752441, 'learning_rate': 1.5968099295501804e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30838449001312257, 'train/info_loss': 0.21039479970932007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011654241243377329, 'train/video_loss': 0.21027825772762299, 'train/total_loss': 0.5186627507209778} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0007, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) +{'loss': 0.379, 'grad_norm': 9.212971687316895, 'learning_rate': 5.160977282317798e-06, 'epoch': 0.68} +tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) +tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) +tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1465477228164673, 'train/info_loss': 0.15409651398658752, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011370573192834855, 'train/video_loss': 0.15398280322551727, 'train/total_loss': 0.3005305230617523} -tensor(0.2196, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3303, 'grad_norm': 2.2374424934387207, 'learning_rate': 1.5959536446639572e-05}[Rank 1] Trainer log: {'loss': 0.3303, 'grad_norm': 2.2374424934387207, 'learning_rate': 1.5959536446639572e-05} - -[Rank 3] Trainer log: {'loss': 0.3303, 'grad_norm': 2.2374424934387207, 'learning_rate': 1.5959536446639572e-05} -[Rank 0] Trainer log: {'loss': 0.3303, 'grad_norm': 2.2374424934387207, 'learning_rate': 1.5959536446639572e-05} -{'loss': 0.3303, 'grad_norm': 2.2374424934387207, 'learning_rate': 1.5959536446639572e-05, 'epoch': 0.33} -tensor(0.1660, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001182809565216303, 'train/lm_loss': 0.00010034904116764665, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.3247128129005432, 'train/uncertainty_loss': 0.01660364419221878, 'train/video_loss': 0.3508160710334778, 'train/total_loss': 0.35091641545295715} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2298, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4315, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23726403713226318, 'train/info_loss': 0.21510286629199982, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011826474219560623, 'train/video_loss': 0.21498459577560425, 'train/total_loss': 0.45224863290786743} +{'train/tv_loss': None, 'train/lm_loss': 0.3761807680130005, 'train/info_loss': 0.15962228178977966, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011396079789847136, 'train/video_loss': 0.15950831770896912, 'train/total_loss': 0.535689115524292} +tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) +{'train/tv_loss': None, 'train/lm_loss': 0.18888275623321535, 'train/info_loss': 0.17058894038200378, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014710992109030486, 'train/video_loss': 0.17044183611869812, 'train/total_loss': 0.35932457447052} tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2111, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4687, 'grad_norm': 3.1779301166534424, 'learning_rate': 1.5950966816030305e-05} -[Rank 0] Trainer log: {'loss': 0.4687, 'grad_norm': 3.1779301166534424, 'learning_rate': 1.5950966816030305e-05}[Rank 2] Trainer log: {'loss': 0.4687, 'grad_norm': 3.1779301166534424, 'learning_rate': 1.5950966816030305e-05} -[Rank 3] Trainer log: {'loss': 0.4687, 'grad_norm': 3.1779301166534424, 'learning_rate': 1.5950966816030305e-05} - -{'loss': 0.4687, 'grad_norm': 3.1779301166534424, 'learning_rate': 1.5950966816030305e-05, 'epoch': 0.33} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4329267978668213, 'train/info_loss': 0.22112645208835602, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012712215539067984, 'train/video_loss': 0.2209993302822113, 'train/total_loss': 0.6539261341094971} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0540, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1230, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009856450371444226, 'train/lm_loss': 6.810459308326244e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.2929264307022095, 'train/uncertainty_loss': 0.012304800003767014, 'train/video_loss': 0.31314894556999207, 'train/total_loss': 0.31321704387664795} -[Rank 3] Trainer log: {'loss': 0.4425, 'grad_norm': 7.144225120544434, 'learning_rate': 1.594239041342595e-05}[Rank 2] Trainer log: {'loss': 0.4425, 'grad_norm': 7.144225120544434, 'learning_rate': 1.594239041342595e-05} - -[Rank 1] Trainer log: {'loss': 0.4425, 'grad_norm': 7.144225120544434, 'learning_rate': 1.594239041342595e-05}[Rank 0] Trainer log: {'loss': 0.4425, 'grad_norm': 7.144225120544434, 'learning_rate': 1.594239041342595e-05} +tensor(0.5842, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(-0.0006, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) +[Rank 3] Trainer log: {'loss': 0.3998, 'grad_norm': 3.0605311393737793, 'learning_rate': 5.15164463169242e-06}[Rank 0] Trainer log: {'loss': 0.3998, 'grad_norm': 3.0605311393737793, 'learning_rate': 5.15164463169242e-06}[Rank 2] Trainer log: {'loss': 0.3998, 'grad_norm': 3.0605311393737793, 'learning_rate': 5.15164463169242e-06} +[Rank 1] Trainer log: {'loss': 0.3998, 'grad_norm': 3.0605311393737793, 'learning_rate': 5.15164463169242e-06} -{'loss': 0.4425, 'grad_norm': 7.144225120544434, 'learning_rate': 1.594239041342595e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.2133, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0633, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010762344114482404, 'train/lm_loss': 0.00012496381532400846, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.26298755407333374, 'train/uncertainty_loss': 0.00632697194814682, 'train/video_loss': 0.2779645621776581, 'train/total_loss': 0.2780895233154297} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0887, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011144211515784264, 'train/lm_loss': 0.00012289085425436498, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.2713351845741272, 'train/uncertainty_loss': 0.0088687464594841, 'train/video_loss': 0.28915613889694214, 'train/total_loss': 0.289279043674469} -[Rank 1] Trainer log: {'loss': 0.405, 'grad_norm': 5.937835693359375, 'learning_rate': 1.5933807248586158e-05} -[Rank 3] Trainer log: {'loss': 0.405, 'grad_norm': 5.937835693359375, 'learning_rate': 1.5933807248586158e-05}[Rank 2] Trainer log: {'loss': 0.405, 'grad_norm': 5.937835693359375, 'learning_rate': 1.5933807248586158e-05} -[Rank 0] Trainer log: {'loss': 0.405, 'grad_norm': 5.937835693359375, 'learning_rate': 1.5933807248586158e-05} -{'loss': 0.405, 'grad_norm': 5.937835693359375, 'learning_rate': 1.5933807248586158e-05, 'epoch': 0.33} +{'loss': 0.3998, 'grad_norm': 3.0605311393737793, 'learning_rate': 5.15164463169242e-06, 'epoch': 0.68} tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2340, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0013181407004594803, 'train/lm_loss': 6.824759184382856e-05, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.37197595834732056, 'train/uncertainty_loss': 0.023395060002803805, 'train/video_loss': 0.4059481620788574, 'train/total_loss': 0.40601640939712524} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10447338819503785, 'train/info_loss': 0.13815245032310486, 'train/ref_loss': None, 'train/uncertainty_loss': -9.193539153784513e-05, 'train/video_loss': 0.13806051015853882, 'train/total_loss': 0.24253389239311218} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0739, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3724, 'grad_norm': 5.946979522705078, 'learning_rate': 1.592521733127827e-05} -[Rank 1] Trainer log: {'loss': 0.3724, 'grad_norm': 5.946979522705078, 'learning_rate': 1.592521733127827e-05} -[Rank 3] Trainer log: {'loss': 0.3724, 'grad_norm': 5.946979522705078, 'learning_rate': 1.592521733127827e-05}[Rank 0] Trainer log: {'loss': 0.3724, 'grad_norm': 5.946979522705078, 'learning_rate': 1.592521733127827e-05} - -{'loss': 0.3724, 'grad_norm': 5.946979522705078, 'learning_rate': 1.592521733127827e-05, 'epoch': 0.33} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1707, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3676, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008820927701890469, 'train/lm_loss': 7.861499325372279e-05, 'train/info_loss': 3.653631210909225e-05, 'train/ref_loss': 0.46024423837661743, 'train/uncertainty_loss': 0.0367579460144043, 'train/video_loss': 0.5040954351425171, 'train/total_loss': 0.5041740536689758} -tensor(0.0429, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.6764779090881348, 'train/info_loss': 0.1906251758337021, 'train/ref_loss': None, 'train/uncertainty_loss': -8.700740290805698e-05, 'train/video_loss': 0.1905381679534912, 'train/total_loss': 0.867016077041626} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0482, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0281, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4245, 'grad_norm': 8.527567863464355, 'learning_rate': 1.591662067127732e-05}[Rank 3] Trainer log: {'loss': 0.4245, 'grad_norm': 8.527567863464355, 'learning_rate': 1.591662067127732e-05} -[Rank 1] Trainer log: {'loss': 0.4245, 'grad_norm': 8.527567863464355, 'learning_rate': 1.591662067127732e-05} -[Rank 2] Trainer log: {'loss': 0.4245, 'grad_norm': 8.527567863464355, 'learning_rate': 1.591662067127732e-05} - -{'loss': 0.4245, 'grad_norm': 8.527567863464355, 'learning_rate': 1.591662067127732e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0340, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000773468241095543, 'train/lm_loss': 6.843826267868281e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.2317778468132019, 'train/uncertainty_loss': 0.0033965475857257845, 'train/video_loss': 0.24139468371868134, 'train/total_loss': 0.24146312475204468} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0673, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0972, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000938603188842535, 'train/lm_loss': 0.00014838435454294087, 'train/info_loss': 4.4463089579949155e-05, 'train/ref_loss': 0.2755737900733948, 'train/uncertainty_loss': 0.009719964861869813, 'train/video_loss': 0.29284703731536865, 'train/total_loss': 0.292995423078537} -[Rank 0] Trainer log: {'loss': 0.3412, 'grad_norm': 3.2891790866851807, 'learning_rate': 1.590801727836601e-05}[Rank 2] Trainer log: {'loss': 0.3412, 'grad_norm': 3.2891790866851807, 'learning_rate': 1.590801727836601e-05} -[Rank 1] Trainer log: {'loss': 0.3412, 'grad_norm': 3.2891790866851807, 'learning_rate': 1.590801727836601e-05}[Rank 3] Trainer log: {'loss': 0.3412, 'grad_norm': 3.2891790866851807, 'learning_rate': 1.590801727836601e-05} - - -{'loss': 0.3412, 'grad_norm': 3.2891790866851807, 'learning_rate': 1.590801727836601e-05, 'epoch': 0.33} -tensor(0.0512, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3100, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008915508165955544, 'train/lm_loss': 9.0197246754542e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.4271255135536194, 'train/uncertainty_loss': 0.03100266456604004, 'train/video_loss': 0.4652954638004303, 'train/total_loss': 0.46538567543029785} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3153928518295288, 'train/info_loss': 0.09969936311244965, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011717667803168297, 'train/video_loss': 0.09958218783140182, 'train/total_loss': 0.41497504711151123} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3049, 'grad_norm': 4.590808391571045, 'learning_rate': 1.589940716233471e-05}[Rank 0] Trainer log: {'loss': 0.3049, 'grad_norm': 4.590808391571045, 'learning_rate': 1.589940716233471e-05} -[Rank 2] Trainer log: {'loss': 0.3049, 'grad_norm': 4.590808391571045, 'learning_rate': 1.589940716233471e-05} -[Rank 3] Trainer log: {'loss': 0.3049, 'grad_norm': 4.590808391571045, 'learning_rate': 1.589940716233471e-05} - -{'loss': 0.3049, 'grad_norm': 4.590808391571045, 'learning_rate': 1.589940716233471e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3170551061630249, 'train/info_loss': 0.16890890896320343, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012449253117665648, 'train/video_loss': 0.16878440976142883, 'train/total_loss': 0.48583951592445374} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009923405945301057, 'train/lm_loss': 7.720887078903616e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.19769926369190216, 'train/uncertainty_loss': -7.268774788826704e-05, 'train/video_loss': 0.20560047030448914, 'train/total_loss': 0.20567767322063446} -[Rank 3] Trainer log: {'loss': 0.3267, 'grad_norm': 4.883946895599365, 'learning_rate': 1.589079033298142e-05} -[Rank 1] Trainer log: {'loss': 0.3267, 'grad_norm': 4.883946895599365, 'learning_rate': 1.589079033298142e-05} -[Rank 0] Trainer log: {'loss': 0.3267, 'grad_norm': 4.883946895599365, 'learning_rate': 1.589079033298142e-05}[Rank 2] Trainer log: {'loss': 0.3267, 'grad_norm': 4.883946895599365, 'learning_rate': 1.589079033298142e-05} - -{'loss': 0.3267, 'grad_norm': 4.883946895599365, 'learning_rate': 1.589079033298142e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2318, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1393, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009189265780150891, 'train/lm_loss': 6.912943790666759e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.3072440028190613, 'train/uncertainty_loss': 0.013932968676090242, 'train/video_loss': 0.3285621702671051, 'train/total_loss': 0.32863131165504456} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11851481199264527, 'train/info_loss': 0.11678551882505417, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010228920727968217, 'train/video_loss': 0.116683229804039, 'train/total_loss': 0.23519805073738098} -tensor(0.0448, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3846, 'grad_norm': 2.1213021278381348, 'learning_rate': 1.588216680011181e-05} -[Rank 2] Trainer log: {'loss': 0.3846, 'grad_norm': 2.1213021278381348, 'learning_rate': 1.588216680011181e-05}[Rank 1] Trainer log: {'loss': 0.3846, 'grad_norm': 2.1213021278381348, 'learning_rate': 1.588216680011181e-05} - -[Rank 0] Trainer log: {'loss': 0.3846, 'grad_norm': 2.1213021278381348, 'learning_rate': 1.588216680011181e-05} -{'loss': 0.3846, 'grad_norm': 2.1213021278381348, 'learning_rate': 1.588216680011181e-05, 'epoch': 0.33} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.49444308280944826, 'train/info_loss': 0.1420062929391861, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001245466060936451, 'train/video_loss': 0.14188174903392792, 'train/total_loss': 0.6363248229026794} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1050, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009140139445662498, 'train/lm_loss': 0.00010742639424279333, 'train/info_loss': 4.207910751574673e-05, 'train/ref_loss': 0.10863326489925385, 'train/uncertainty_loss': -6.970321410335601e-05, 'train/video_loss': 0.11591775715351105, 'train/total_loss': 0.11602518707513809} -tensor(0.1502, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3825, 'grad_norm': 2.450460910797119, 'learning_rate': 1.5873536573539153e-05}[Rank 1] Trainer log: {'loss': 0.3825, 'grad_norm': 2.450460910797119, 'learning_rate': 1.5873536573539153e-05} - -[Rank 0] Trainer log: {'loss': 0.3825, 'grad_norm': 2.450460910797119, 'learning_rate': 1.5873536573539153e-05}[Rank 3] Trainer log: {'loss': 0.3825, 'grad_norm': 2.450460910797119, 'learning_rate': 1.5873536573539153e-05} - -{'loss': 0.3825, 'grad_norm': 2.450460910797119, 'learning_rate': 1.5873536573539153e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1063, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3021, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010750428773462772, 'train/lm_loss': 8.857672801241279e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.40660160779953003, 'train/uncertainty_loss': 0.030207663774490356, 'train/video_loss': 0.445444792509079, 'train/total_loss': 0.4455333650112152} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3631515979766846, 'train/info_loss': 0.13795937597751617, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010381226893514396, 'train/video_loss': 0.13785555958747864, 'train/total_loss': 0.5010071992874146} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3639, 'grad_norm': 10.488601684570312, 'learning_rate': 1.5864899663084356e-05}[Rank 1] Trainer log: {'loss': 0.3639, 'grad_norm': 10.488601684570312, 'learning_rate': 1.5864899663084356e-05}[Rank 2] Trainer log: {'loss': 0.3639, 'grad_norm': 10.488601684570312, 'learning_rate': 1.5864899663084356e-05} - - -[Rank 3] Trainer log: {'loss': 0.3639, 'grad_norm': 10.488601684570312, 'learning_rate': 1.5864899663084356e-05} -{'loss': 0.3639, 'grad_norm': 10.488601684570312, 'learning_rate': 1.5864899663084356e-05, 'epoch': 0.34} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0913, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1201, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000809625256806612, 'train/lm_loss': 9.012575610540808e-05, 'train/info_loss': 3.951631879317574e-05, 'train/ref_loss': 0.2957640290260315, 'train/uncertainty_loss': 0.012009514123201372, 'train/video_loss': 0.31429004669189453, 'train/total_loss': 0.3143801689147949} -tensor(0.1127, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0018, device='cuda:2', grad_fn=) tensor(-0.0018, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11777396202087403, 'train/info_loss': 0.1355169117450714, 'train/ref_loss': None, 'train/uncertainty_loss': -8.89415736310184e-05, 'train/video_loss': 0.13542796671390533, 'train/total_loss': 0.2532019317150116} -[Rank 2] Trainer log: {'loss': 0.3881, 'grad_norm': 3.094700813293457, 'learning_rate': 1.585625607857592e-05}[Rank 3] Trainer log: {'loss': 0.3881, 'grad_norm': 3.094700813293457, 'learning_rate': 1.585625607857592e-05}[Rank 0] Trainer log: {'loss': 0.3881, 'grad_norm': 3.094700813293457, 'learning_rate': 1.585625607857592e-05} - -[Rank 1] Trainer log: {'loss': 0.3881, 'grad_norm': 3.094700813293457, 'learning_rate': 1.585625607857592e-05} - -{'loss': 0.3881, 'grad_norm': 3.094700813293457, 'learning_rate': 1.585625607857592e-05, 'epoch': 0.34} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2849531888961792, 'train/info_loss': 0.40507107973098755, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011478584492579103, 'train/video_loss': 0.40495628118515015, 'train/total_loss': 0.6899094581604004} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0430, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2145, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15039920806884766, 'train/info_loss': 0.1701315939426422, 'train/ref_loss': None, 'train/uncertainty_loss': -9.417122346349061e-05, 'train/video_loss': 0.1700374186038971, 'train/total_loss': 0.32043662667274475} -[Rank 1] Trainer log: {'loss': 0.4012, 'grad_norm': 4.463277339935303, 'learning_rate': 1.5847605829849953e-05}[Rank 3] Trainer log: {'loss': 0.4012, 'grad_norm': 4.463277339935303, 'learning_rate': 1.5847605829849953e-05}[Rank 2] Trainer log: {'loss': 0.4012, 'grad_norm': 4.463277339935303, 'learning_rate': 1.5847605829849953e-05} - - -[Rank 0] Trainer log: {'loss': 0.4012, 'grad_norm': 4.463277339935303, 'learning_rate': 1.5847605829849953e-05} -{'loss': 0.4012, 'grad_norm': 4.463277339935303, 'learning_rate': 1.5847605829849953e-05, 'epoch': 0.34} -tensor(-0.0017, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.00012991975527256727, 'train/info_loss': 3.06060173898004e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00017073913477361202, 'train/video_loss': -0.000140133110107854, 'train/total_loss': -1.0213349014520645e-05} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4406, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38437843322753906, 'train/info_loss': 0.28778061270713806, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011300015030428767, 'train/video_loss': 0.2876676023006439, 'train/total_loss': 0.6720460653305054} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3889, 'grad_norm': 10.430274963378906, 'learning_rate': 1.5838948926750135e-05}[Rank 1] Trainer log: {'loss': 0.3889, 'grad_norm': 10.430274963378906, 'learning_rate': 1.5838948926750135e-05}[Rank 2] Trainer log: {'loss': 0.3889, 'grad_norm': 10.430274963378906, 'learning_rate': 1.5838948926750135e-05} - -[Rank 3] Trainer log: {'loss': 0.3889, 'grad_norm': 10.430274963378906, 'learning_rate': 1.5838948926750135e-05} - -{'loss': 0.3889, 'grad_norm': 10.430274963378906, 'learning_rate': 1.5838948926750135e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1713, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009829594753682613, 'train/lm_loss': 9.560683392919601e-05, 'train/info_loss': 3.862231824314222e-05, 'train/ref_loss': 0.16269376873970032, 'train/uncertainty_loss': -7.544680847786367e-05, 'train/video_loss': 0.17052061855793, 'train/total_loss': 0.17061622440814972} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0650, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0160, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006737610790878535, 'train/lm_loss': 0.0001398789347149432, 'train/info_loss': 4.374789568828419e-05, 'train/ref_loss': 0.21857184171676636, 'train/uncertainty_loss': 0.0015975823625922203, 'train/video_loss': 0.22560326755046844, 'train/total_loss': 0.2257431447505951} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3222, 'grad_norm': 9.639322280883789, 'learning_rate': 1.5830285379127724e-05}[Rank 2] Trainer log: {'loss': 0.3222, 'grad_norm': 9.639322280883789, 'learning_rate': 1.5830285379127724e-05}[Rank 3] Trainer log: {'loss': 0.3222, 'grad_norm': 9.639322280883789, 'learning_rate': 1.5830285379127724e-05} - - -[Rank 0] Trainer log: {'loss': 0.3222, 'grad_norm': 9.639322280883789, 'learning_rate': 1.5830285379127724e-05} -{'loss': 0.3222, 'grad_norm': 9.639322280883789, 'learning_rate': 1.5830285379127724e-05, 'epoch': 0.34} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12007774114608766, 'train/info_loss': 0.12695513665676117, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011058262316510081, 'train/video_loss': 0.12684455513954163, 'train/total_loss': 0.2469222992658615} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1208, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40276670455932617, 'train/info_loss': 0.2702334523200989, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013414978748187424, 'train/video_loss': 0.27009931206703186, 'train/total_loss': 0.6728659868240356} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3625, 'grad_norm': 3.8932435512542725, 'learning_rate': 1.5821615196841535e-05}[Rank 1] Trainer log: {'loss': 0.3625, 'grad_norm': 3.8932435512542725, 'learning_rate': 1.5821615196841535e-05}[Rank 3] Trainer log: {'loss': 0.3625, 'grad_norm': 3.8932435512542725, 'learning_rate': 1.5821615196841535e-05} - - -[Rank 0] Trainer log: {'loss': 0.3625, 'grad_norm': 3.8932435512542725, 'learning_rate': 1.5821615196841535e-05} -{'loss': 0.3625, 'grad_norm': 3.8932435512542725, 'learning_rate': 1.5821615196841535e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3706568479537964, 'train/info_loss': 0.20317019522190094, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001075883279554546, 'train/video_loss': 0.20306260883808136, 'train/total_loss': 0.5737194418907166} -tensor(0.2802, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22633986473083498, 'train/info_loss': 0.22643724083900452, 'train/ref_loss': None, 'train/uncertainty_loss': -8.828663849271835e-05, 'train/video_loss': 0.22634895145893097, 'train/total_loss': 0.4526888132095337} -tensor(0.4292, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4916, 'grad_norm': 15.725339889526367, 'learning_rate': 1.5812938389757945e-05}[Rank 1] Trainer log: {'loss': 0.4916, 'grad_norm': 15.725339889526367, 'learning_rate': 1.5812938389757945e-05}[Rank 2] Trainer log: {'loss': 0.4916, 'grad_norm': 15.725339889526367, 'learning_rate': 1.5812938389757945e-05} - - -[Rank 0] Trainer log: {'loss': 0.4916, 'grad_norm': 15.725339889526367, 'learning_rate': 1.5812938389757945e-05} -{'loss': 0.4916, 'grad_norm': 15.725339889526367, 'learning_rate': 1.5812938389757945e-05, 'epoch': 0.34} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009246451780200005, 'train/lm_loss': 9.52493806835264e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.06118236482143402, 'train/uncertainty_loss': -7.211556658148766e-05, 'train/video_loss': 0.06854341179132462, 'train/total_loss': 0.06863866001367569} -tensor(0.0370, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3836, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007094808854162693, 'train/lm_loss': 0.0003841998986899853, 'train/info_loss': 5.5310050811385736e-05, 'train/ref_loss': 0.49114322662353516, 'train/uncertainty_loss': 0.03836014270782471, 'train/video_loss': 0.5352345108985901, 'train/total_loss': 0.5356187224388123}tensor(0.2782, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4136, 'grad_norm': 10.743165969848633, 'learning_rate': 1.5804254967750854e-05} -[Rank 2] Trainer log: {'loss': 0.4136, 'grad_norm': 10.743165969848633, 'learning_rate': 1.5804254967750854e-05} -[Rank 1] Trainer log: {'loss': 0.4136, 'grad_norm': 10.743165969848633, 'learning_rate': 1.5804254967750854e-05} -[Rank 0] Trainer log: {'loss': 0.4136, 'grad_norm': 10.743165969848633, 'learning_rate': 1.5804254967750854e-05} -{'loss': 0.4136, 'grad_norm': 10.743165969848633, 'learning_rate': 1.5804254967750854e-05, 'epoch': 0.34} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0496, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0420, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011221399530768395, 'train/lm_loss': 8.850523154251278e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.23387983441352844, 'train/uncertainty_loss': 0.004201203212141991, 'train/video_loss': 0.247094988822937, 'train/total_loss': 0.24718348681926727} -tensor(0.4254, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2528386116027832, 'train/info_loss': 0.1868072897195816, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011799349449574948, 'train/video_loss': 0.18668930232524872, 'train/total_loss': 0.4395279288291931} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1571, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4075, 'grad_norm': 2.158468723297119, 'learning_rate': 1.57955649407017e-05}[Rank 1] Trainer log: {'loss': 0.4075, 'grad_norm': 2.158468723297119, 'learning_rate': 1.57955649407017e-05}[Rank 2] Trainer log: {'loss': 0.4075, 'grad_norm': 2.158468723297119, 'learning_rate': 1.57955649407017e-05} - - -[Rank 0] Trainer log: {'loss': 0.4075, 'grad_norm': 2.158468723297119, 'learning_rate': 1.57955649407017e-05} -{'loss': 0.4075, 'grad_norm': 2.158468723297119, 'learning_rate': 1.57955649407017e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18766574859619142, 'train/info_loss': 0.2785934805870056, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013217844534665346, 'train/video_loss': 0.2784613072872162, 'train/total_loss': 0.46612703800201416} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1429137468338013, 'train/info_loss': 0.19980545341968536, 'train/ref_loss': None, 'train/uncertainty_loss': -9.411797509528697e-05, 'train/video_loss': 0.19971133768558502, 'train/total_loss': 0.34262508153915405} -[Rank 2] Trainer log: {'loss': 0.3792, 'grad_norm': 4.648589611053467, 'learning_rate': 1.5786868318499437e-05}[Rank 0] Trainer log: {'loss': 0.3792, 'grad_norm': 4.648589611053467, 'learning_rate': 1.5786868318499437e-05}[Rank 3] Trainer log: {'loss': 0.3792, 'grad_norm': 4.648589611053467, 'learning_rate': 1.5786868318499437e-05} - -[Rank 1] Trainer log: {'loss': 0.3792, 'grad_norm': 4.648589611053467, 'learning_rate': 1.5786868318499437e-05} - -{'loss': 0.3792, 'grad_norm': 4.648589611053467, 'learning_rate': 1.5786868318499437e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007922560907900333, 'train/lm_loss': 6.812842329964043e-05, 'train/info_loss': 3.278148142271675e-05, 'train/ref_loss': 0.08541804552078247, 'train/uncertainty_loss': -7.55242188461125e-05, 'train/video_loss': 0.09171334654092789, 'train/total_loss': 0.09178147464990616} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008771516382694245, 'train/lm_loss': 8.874354534782469e-05, 'train/info_loss': 3.6297911719884723e-05, 'train/ref_loss': 0.20650777220726013, 'train/uncertainty_loss': -7.196879014372826e-05, 'train/video_loss': 0.21348930895328522, 'train/total_loss': 0.21357804536819458} -tensor(0.1005, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3465, 'grad_norm': 9.03963851928711, 'learning_rate': 1.5778165111040512e-05}[Rank 1] Trainer log: {'loss': 0.3465, 'grad_norm': 9.03963851928711, 'learning_rate': 1.5778165111040512e-05} -[Rank 3] Trainer log: {'loss': 0.3465, 'grad_norm': 9.03963851928711, 'learning_rate': 1.5778165111040512e-05} -[Rank 0] Trainer log: {'loss': 0.3465, 'grad_norm': 9.03963851928711, 'learning_rate': 1.5778165111040512e-05} - -{'loss': 0.3465, 'grad_norm': 9.03963851928711, 'learning_rate': 1.5778165111040512e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0222, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.2189, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3773045063018799, 'train/info_loss': 0.18981419503688812, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011059982934966684, 'train/video_loss': 0.18970359861850739, 'train/total_loss': 0.5670081377029419} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1589473605155945, 'train/info_loss': 0.16358093917369843, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010322518646717073, 'train/video_loss': 0.16347771883010864, 'train/total_loss': 0.32242506742477417} -tensor(0.0781, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0954, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3898, 'grad_norm': 13.311031341552734, 'learning_rate': 1.5769455328228888e-05}[Rank 1] Trainer log: {'loss': 0.3898, 'grad_norm': 13.311031341552734, 'learning_rate': 1.5769455328228888e-05}[Rank 3] Trainer log: {'loss': 0.3898, 'grad_norm': 13.311031341552734, 'learning_rate': 1.5769455328228888e-05} - - -[Rank 0] Trainer log: {'loss': 0.3898, 'grad_norm': 13.311031341552734, 'learning_rate': 1.5769455328228888e-05} -{'loss': 0.3898, 'grad_norm': 13.311031341552734, 'learning_rate': 1.5769455328228888e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23635046482086183, 'train/info_loss': 0.25057893991470337, 'train/ref_loss': None, 'train/uncertainty_loss': -8.733721333555878e-05, 'train/video_loss': 0.25049158930778503, 'train/total_loss': 0.4868420362472534} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30993726253509524, 'train/info_loss': 0.20354391634464264, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010965837864205242, 'train/video_loss': 0.20343425869941711, 'train/total_loss': 0.5133715271949768} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4768, 'grad_norm': 6.088080406188965, 'learning_rate': 1.576073897997598e-05}[Rank 1] Trainer log: {'loss': 0.4768, 'grad_norm': 6.088080406188965, 'learning_rate': 1.576073897997598e-05}[Rank 0] Trainer log: {'loss': 0.4768, 'grad_norm': 6.088080406188965, 'learning_rate': 1.576073897997598e-05} - - -[Rank 3] Trainer log: {'loss': 0.4768, 'grad_norm': 6.088080406188965, 'learning_rate': 1.576073897997598e-05} -{'loss': 0.4768, 'grad_norm': 6.088080406188965, 'learning_rate': 1.576073897997598e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.360989785194397, 'train/info_loss': 0.23664602637290955, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011890854220837356, 'train/video_loss': 0.23652711510658264, 'train/total_loss': 0.5975168943405151} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0459, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4321929931640625, 'train/info_loss': 0.18490645289421082, 'train/ref_loss': None, 'train/uncertainty_loss': -9.777147788554431e-05, 'train/video_loss': 0.18480868637561798, 'train/total_loss': 0.6170017123222351} -[Rank 1] Trainer log: {'loss': 0.3803, 'grad_norm': 6.23649263381958, 'learning_rate': 1.5752016076200715e-05} -[Rank 0] Trainer log: {'loss': 0.3803, 'grad_norm': 6.23649263381958, 'learning_rate': 1.5752016076200715e-05}[Rank 3] Trainer log: {'loss': 0.3803, 'grad_norm': 6.23649263381958, 'learning_rate': 1.5752016076200715e-05} -[Rank 2] Trainer log: {'loss': 0.3803, 'grad_norm': 6.23649263381958, 'learning_rate': 1.5752016076200715e-05} - -{'loss': 0.3803, 'grad_norm': 6.23649263381958, 'learning_rate': 1.5752016076200715e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5438114166259765, 'train/info_loss': 0.1666916310787201, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011561915744096041, 'train/video_loss': 0.16657601296901703, 'train/total_loss': 0.7103874683380127} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32997698783874513, 'train/info_loss': 0.21658217906951904, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001302808290347457, 'train/video_loss': 0.21645189821720123, 'train/total_loss': 0.546428918838501} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1507, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4887, 'grad_norm': 3.8389883041381836, 'learning_rate': 1.5743286626829437e-05}[Rank 1] Trainer log: {'loss': 0.4887, 'grad_norm': 3.8389883041381836, 'learning_rate': 1.5743286626829437e-05} -[Rank 3] Trainer log: {'loss': 0.4887, 'grad_norm': 3.8389883041381836, 'learning_rate': 1.5743286626829437e-05} - -[Rank 2] Trainer log: {'loss': 0.4887, 'grad_norm': 3.8389883041381836, 'learning_rate': 1.5743286626829437e-05} -{'loss': 0.4887, 'grad_norm': 3.8389883041381836, 'learning_rate': 1.5743286626829437e-05, 'epoch': 0.34} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2620125770568848, 'train/info_loss': 0.1897217482328415, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011448318837210537, 'train/video_loss': 0.18960726261138916, 'train/total_loss': 0.45161983370780945} -tensor(0.0019, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33066725730895996, 'train/info_loss': 0.20798012614250183, 'train/ref_loss': None, 'train/uncertainty_loss': -9.062947938218713e-05, 'train/video_loss': 0.20788949728012085, 'train/total_loss': 0.5385567545890808} -tensor(0.5530, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2505, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4547, 'grad_norm': 13.096393585205078, 'learning_rate': 1.573455064179597e-05} -[Rank 1] Trainer log: {'loss': 0.4547, 'grad_norm': 13.096393585205078, 'learning_rate': 1.573455064179597e-05}[Rank 2] Trainer log: {'loss': 0.4547, 'grad_norm': 13.096393585205078, 'learning_rate': 1.573455064179597e-05} - -[Rank 0] Trainer log: {'loss': 0.4547, 'grad_norm': 13.096393585205078, 'learning_rate': 1.573455064179597e-05} -{'loss': 0.4547, 'grad_norm': 13.096393585205078, 'learning_rate': 1.573455064179597e-05, 'epoch': 0.34} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1661, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008268521167337895, 'train/lm_loss': 6.83667603880167e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.32549697160720825, 'train/uncertainty_loss': 0.016614583134651185, 'train/video_loss': 0.34875890612602234, 'train/total_loss': 0.3488272726535797} -tensor(0.2573, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1611, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009263922460377217, 'train/lm_loss': 0.00010037287138402462, 'train/info_loss': 3.5403903893893585e-05, 'train/ref_loss': 0.3213673233985901, 'train/uncertainty_loss': 0.016105684638023376, 'train/video_loss': 0.3449195623397827, 'train/total_loss': 0.3450199365615845} -tensor(0.1529, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2879, 'grad_norm': 9.447772979736328, 'learning_rate': 1.5725808131041557e-05}[Rank 2] Trainer log: {'loss': 0.2879, 'grad_norm': 9.447772979736328, 'learning_rate': 1.5725808131041557e-05}[Rank 1] Trainer log: {'loss': 0.2879, 'grad_norm': 9.447772979736328, 'learning_rate': 1.5725808131041557e-05} - - -[Rank 0] Trainer log: {'loss': 0.2879, 'grad_norm': 9.447772979736328, 'learning_rate': 1.5725808131041557e-05} -{'loss': 0.2879, 'grad_norm': 9.447772979736328, 'learning_rate': 1.5725808131041557e-05, 'epoch': 0.34} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0376, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0702, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000988304615020752, 'train/lm_loss': 6.810459308326244e-05, 'train/info_loss': 3.3556287235114723e-05, 'train/ref_loss': 0.26311546564102173, 'train/uncertainty_loss': 0.007019525766372681, 'train/video_loss': 0.2780749797821045, 'train/total_loss': 0.2781430780887604} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3269564151763916, 'train/info_loss': 0.16649970412254333, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012531745014712214, 'train/video_loss': 0.16637438535690308, 'train/total_loss': 0.49333080649375916} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.5015, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0969, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4065, 'grad_norm': 3.4763898849487305, 'learning_rate': 1.5717059104514877e-05}[Rank 1] Trainer log: {'loss': 0.4065, 'grad_norm': 3.4763898849487305, 'learning_rate': 1.5717059104514877e-05}[Rank 2] Trainer log: {'loss': 0.4065, 'grad_norm': 3.4763898849487305, 'learning_rate': 1.5717059104514877e-05} - - -[Rank 0] Trainer log: {'loss': 0.4065, 'grad_norm': 3.4763898849487305, 'learning_rate': 1.5717059104514877e-05} -{'loss': 0.4065, 'grad_norm': 3.4763898849487305, 'learning_rate': 1.5717059104514877e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21750359535217287, 'train/info_loss': 0.22790735960006714, 'train/ref_loss': None, 'train/uncertainty_loss': -9.054770343936981e-05, 'train/video_loss': 0.22781680524349213, 'train/total_loss': 0.44532039761543274} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010088445618748666, 'train/lm_loss': 7.816217839717865e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.18973639607429504, 'train/uncertainty_loss': -6.964800413697959e-05, 'train/video_loss': 0.19776855409145355, 'train/total_loss': 0.19784671068191528} -[Rank 0] Trainer log: {'loss': 0.3533, 'grad_norm': 4.091549396514893, 'learning_rate': 1.5708303572172018e-05}[Rank 1] Trainer log: {'loss': 0.3533, 'grad_norm': 4.091549396514893, 'learning_rate': 1.5708303572172018e-05} -[Rank 3] Trainer log: {'loss': 0.3533, 'grad_norm': 4.091549396514893, 'learning_rate': 1.5708303572172018e-05} -[Rank 2] Trainer log: {'loss': 0.3533, 'grad_norm': 4.091549396514893, 'learning_rate': 1.5708303572172018e-05} - -{'loss': 0.3533, 'grad_norm': 4.091549396514893, 'learning_rate': 1.5708303572172018e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4710683822631836, 'train/info_loss': 0.21215210855007172, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011353116715326906, 'train/video_loss': 0.2120385766029358, 'train/total_loss': 0.6831069588661194} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3876, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0429, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009683944284915925, 'train/lm_loss': 0.0001018741517327726, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.2492130696773529, 'train/uncertainty_loss': 0.004285250604152679, 'train/video_loss': 0.2612837851047516, 'train/total_loss': 0.26138564944267273} -[Rank 3] Trainer log: {'loss': 0.4404, 'grad_norm': 3.387786388397217, 'learning_rate': 1.5699541543976478e-05} -[Rank 2] Trainer log: {'loss': 0.4404, 'grad_norm': 3.387786388397217, 'learning_rate': 1.5699541543976478e-05} -[Rank 1] Trainer log: {'loss': 0.4404, 'grad_norm': 3.387786388397217, 'learning_rate': 1.5699541543976478e-05} -[Rank 0] Trainer log: {'loss': 0.4404, 'grad_norm': 3.387786388397217, 'learning_rate': 1.5699541543976478e-05} -{'loss': 0.4404, 'grad_norm': 3.387786388397217, 'learning_rate': 1.5699541543976478e-05, 'epoch': 0.34} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1437, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010969148948788644, 'train/lm_loss': 0.00011571873910725117, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.30992740392684937, 'train/uncertainty_loss': 0.01437147706747055, 'train/video_loss': 0.3331119418144226, 'train/total_loss': 0.33322766423225403} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.8098, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011487090028822423, 'train/lm_loss': 0.00010034904116764665, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.11792074143886566, 'train/uncertainty_loss': -7.43156357202679e-05, 'train/video_loss': 0.12707293033599854, 'train/total_loss': 0.1271732747554779} -[Rank 3] Trainer log: {'loss': 0.4453, 'grad_norm': 13.149181365966797, 'learning_rate': 1.5690773029899143e-05}[Rank 2] Trainer log: {'loss': 0.4453, 'grad_norm': 13.149181365966797, 'learning_rate': 1.5690773029899143e-05}[Rank 1] Trainer log: {'loss': 0.4453, 'grad_norm': 13.149181365966797, 'learning_rate': 1.5690773029899143e-05} - - -[Rank 0] Trainer log: {'loss': 0.4453, 'grad_norm': 13.149181365966797, 'learning_rate': 1.5690773029899143e-05} -{'loss': 0.4453, 'grad_norm': 13.149181365966797, 'learning_rate': 1.5690773029899143e-05, 'epoch': 0.34} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.7243, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0770, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011715554632246495, 'train/lm_loss': 0.00010173117043450476, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.2639877200126648, 'train/uncertainty_loss': 0.007701466232538224, 'train/video_loss': 0.2810993492603302, 'train/total_loss': 0.2812010943889618} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.2682, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3376, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012744962237775327, 'train/lm_loss': 7.73280335124582e-05, 'train/info_loss': 3.278148142271675e-05, 'train/ref_loss': 0.15324510633945465, 'train/uncertainty_loss': -7.17300339601934e-05, 'train/video_loss': 0.16340212523937225, 'train/total_loss': 0.16347944736480713} -[Rank 2] Trainer log: {'loss': 0.3932, 'grad_norm': 8.711523056030273, 'learning_rate': 1.5681998039918272e-05}[Rank 0] Trainer log: {'loss': 0.3932, 'grad_norm': 8.711523056030273, 'learning_rate': 1.5681998039918272e-05} -[Rank 3] Trainer log: {'loss': 0.3932, 'grad_norm': 8.711523056030273, 'learning_rate': 1.5681998039918272e-05}[Rank 1] Trainer log: {'loss': 0.3932, 'grad_norm': 8.711523056030273, 'learning_rate': 1.5681998039918272e-05} - - -{'loss': 0.3932, 'grad_norm': 8.711523056030273, 'learning_rate': 1.5681998039918272e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2711989402770996, 'train/info_loss': 0.15718568861484528, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011434355983510614, 'train/video_loss': 0.15707135200500488, 'train/total_loss': 0.4282703101634979} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11774529218673707, 'train/info_loss': 0.2250039130449295, 'train/ref_loss': None, 'train/uncertainty_loss': -8.841823437251152e-05, 'train/video_loss': 0.2249154895544052, 'train/total_loss': 0.3426607847213745} -[Rank 1] Trainer log: {'loss': 0.3909, 'grad_norm': 2.8918118476867676, 'learning_rate': 1.567321658401951e-05}[Rank 0] Trainer log: {'loss': 0.3909, 'grad_norm': 2.8918118476867676, 'learning_rate': 1.567321658401951e-05} -[Rank 3] Trainer log: {'loss': 0.3909, 'grad_norm': 2.8918118476867676, 'learning_rate': 1.567321658401951e-05} -[Rank 2] Trainer log: {'loss': 0.3909, 'grad_norm': 2.8918118476867676, 'learning_rate': 1.567321658401951e-05} - -{'loss': 0.3909, 'grad_norm': 2.8918118476867676, 'learning_rate': 1.567321658401951e-05, 'epoch': 0.34} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06341180205345154, 'train/info_loss': 0.19976936280727386, 'train/ref_loss': None, 'train/uncertainty_loss': -9.941450553014876e-05, 'train/video_loss': 0.1996699422597885, 'train/total_loss': 0.26308172941207886} -tensor(0.1240, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0894, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25198585987091066, 'train/info_loss': 0.29820725321769714, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012576369335874917, 'train/video_loss': 0.29808148741722107, 'train/total_loss': 0.5500673651695251} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3434, 'grad_norm': 1.80190908908844, 'learning_rate': 1.5664428672195852e-05}[Rank 2] Trainer log: {'loss': 0.3434, 'grad_norm': 1.80190908908844, 'learning_rate': 1.5664428672195852e-05}[Rank 1] Trainer log: {'loss': 0.3434, 'grad_norm': 1.80190908908844, 'learning_rate': 1.5664428672195852e-05} - - -[Rank 0] Trainer log: {'loss': 0.3434, 'grad_norm': 1.80190908908844, 'learning_rate': 1.5664428672195852e-05} -{'loss': 0.3434, 'grad_norm': 1.80190908908844, 'learning_rate': 1.5664428672195852e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4043272972106934, 'train/info_loss': 0.22679482400417328, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012732746545225383, 'train/video_loss': 0.22666749358177185, 'train/total_loss': 0.6309947967529297} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0750, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30996360778808596, 'train/info_loss': 0.2309192270040512, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012035008985549212, 'train/video_loss': 0.2307988703250885, 'train/total_loss': 0.5407624840736389} -tensor(0.0446, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4549, 'grad_norm': 3.2171928882598877, 'learning_rate': 1.565563431444763e-05}[Rank 3] Trainer log: {'loss': 0.4549, 'grad_norm': 3.2171928882598877, 'learning_rate': 1.565563431444763e-05}[Rank 1] Trainer log: {'loss': 0.4549, 'grad_norm': 3.2171928882598877, 'learning_rate': 1.565563431444763e-05} - - -[Rank 0] Trainer log: {'loss': 0.4549, 'grad_norm': 3.2171928882598877, 'learning_rate': 1.565563431444763e-05} -{'loss': 0.4549, 'grad_norm': 3.2171928882598877, 'learning_rate': 1.565563431444763e-05, 'epoch': 0.34} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0117, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010913616977632045, 'train/lm_loss': 0.00017859112704172732, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.2199040949344635, 'train/uncertainty_loss': 0.0011722839437425137, 'train/video_loss': 0.22984839975833893, 'train/total_loss': 0.2300269901752472} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13560642004013063, 'train/info_loss': 0.17631377279758453, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001234297058545053, 'train/video_loss': 0.1761903464794159, 'train/total_loss': 0.31179678440093994} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3192, 'grad_norm': 2.8691091537475586, 'learning_rate': 1.5646833520782525e-05}[Rank 3] Trainer log: {'loss': 0.3192, 'grad_norm': 2.8691091537475586, 'learning_rate': 1.5646833520782525e-05} - -[Rank 2] Trainer log: {'loss': 0.3192, 'grad_norm': 2.8691091537475586, 'learning_rate': 1.5646833520782525e-05} -[Rank 0] Trainer log: {'loss': 0.3192, 'grad_norm': 2.8691091537475586, 'learning_rate': 1.5646833520782525e-05} -{'loss': 0.3192, 'grad_norm': 2.8691091537475586, 'learning_rate': 1.5646833520782525e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17937642335891724, 'train/info_loss': 0.16748204827308655, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010114974575117231, 'train/video_loss': 0.1673808991909027, 'train/total_loss': 0.34675732254981995} -tensor(0.1875, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0756, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0017, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3899754047393799, 'train/info_loss': 0.2528241276741028, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001655134023167193, 'train/video_loss': 0.2526586055755615, 'train/total_loss': 0.6426340341567993} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3918, 'grad_norm': 7.804955959320068, 'learning_rate': 1.5638026301215537e-05}[Rank 0] Trainer log: {'loss': 0.3918, 'grad_norm': 7.804955959320068, 'learning_rate': 1.5638026301215537e-05} -[Rank 2] Trainer log: {'loss': 0.3918, 'grad_norm': 7.804955959320068, 'learning_rate': 1.5638026301215537e-05}[Rank 1] Trainer log: {'loss': 0.3918, 'grad_norm': 7.804955959320068, 'learning_rate': 1.5638026301215537e-05} - - -{'loss': 0.3918, 'grad_norm': 7.804955959320068, 'learning_rate': 1.5638026301215537e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1581848502159119, 'train/info_loss': 0.13721036911010742, 'train/ref_loss': None, 'train/uncertainty_loss': -8.663865155540408e-05, 'train/video_loss': 0.1371237337589264, 'train/total_loss': 0.29530858993530273} -tensor(0.1493, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32106933593750003, 'train/info_loss': 0.24159500002861023, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012471772497519852, 'train/video_loss': 0.24147027730941772, 'train/total_loss': 0.5625395774841309} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1010, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4298, 'grad_norm': 3.997100353240967, 'learning_rate': 1.562921266576898e-05}[Rank 2] Trainer log: {'loss': 0.4298, 'grad_norm': 3.997100353240967, 'learning_rate': 1.562921266576898e-05}[Rank 3] Trainer log: {'loss': 0.4298, 'grad_norm': 3.997100353240967, 'learning_rate': 1.562921266576898e-05} - - -[Rank 0] Trainer log: {'loss': 0.4298, 'grad_norm': 3.997100353240967, 'learning_rate': 1.562921266576898e-05} -{'loss': 0.4298, 'grad_norm': 3.997100353240967, 'learning_rate': 1.562921266576898e-05, 'epoch': 0.34} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0072, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001032422110438347, 'train/lm_loss': 0.00013213560450822114, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.20043635368347168, 'train/uncertainty_loss': 0.0007249395828694106, 'train/video_loss': 0.20946083962917328, 'train/total_loss': 0.20959296822547913} -tensor(0.4682, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11937690973281861, 'train/info_loss': 0.22582241892814636, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011914016213268042, 'train/video_loss': 0.22570328414440155, 'train/total_loss': 0.3450801968574524} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.385, 'grad_norm': 6.640826225280762, 'learning_rate': 1.562039262447246e-05}[Rank 0] Trainer log: {'loss': 0.385, 'grad_norm': 6.640826225280762, 'learning_rate': 1.562039262447246e-05} -[Rank 2] Trainer log: {'loss': 0.385, 'grad_norm': 6.640826225280762, 'learning_rate': 1.562039262447246e-05} -[Rank 1] Trainer log: {'loss': 0.385, 'grad_norm': 6.640826225280762, 'learning_rate': 1.562039262447246e-05} - -{'loss': 0.385, 'grad_norm': 6.640826225280762, 'learning_rate': 1.562039262447246e-05, 'epoch': 0.34} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0867, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2981, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000986987352371216, 'train/lm_loss': 8.943465072661639e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.41796165704727173, 'train/uncertainty_loss': 0.029808360338211062, 'train/video_loss': 0.4557027518749237, 'train/total_loss': 0.4557921886444092} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2580089092254639, 'train/info_loss': 0.18633423745632172, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011337067699059845, 'train/video_loss': 0.18622086942195892, 'train/total_loss': 0.44422978162765503} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1446, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3548, 'grad_norm': 4.600274085998535, 'learning_rate': 1.561156618736289e-05}[Rank 2] Trainer log: {'loss': 0.3548, 'grad_norm': 4.600274085998535, 'learning_rate': 1.561156618736289e-05}[Rank 1] Trainer log: {'loss': 0.3548, 'grad_norm': 4.600274085998535, 'learning_rate': 1.561156618736289e-05} - - -[Rank 0] Trainer log: {'loss': 0.3548, 'grad_norm': 4.600274085998535, 'learning_rate': 1.561156618736289e-05} -{'loss': 0.3548, 'grad_norm': 4.600274085998535, 'learning_rate': 1.561156618736289e-05, 'epoch': 0.34} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2532, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008495026268064976, 'train/lm_loss': 0.00019867087248712779, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.3852170705795288, 'train/uncertainty_loss': 0.025315144658088685, 'train/video_loss': 0.41736936569213867, 'train/total_loss': 0.41756802797317505} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3328, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0565, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010650861077010632, 'train/lm_loss': 0.00011552810901775956, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.2607022523880005, 'train/uncertainty_loss': 0.0056522760540246965, 'train/video_loss': 0.2749153971672058, 'train/total_loss': 0.2750309109687805} -[Rank 3] Trainer log: {'loss': 0.4176, 'grad_norm': 9.069355964660645, 'learning_rate': 1.5602733364484443e-05}[Rank 2] Trainer log: {'loss': 0.4176, 'grad_norm': 9.069355964660645, 'learning_rate': 1.5602733364484443e-05}[Rank 1] Trainer log: {'loss': 0.4176, 'grad_norm': 9.069355964660645, 'learning_rate': 1.5602733364484443e-05} - - -[Rank 0] Trainer log: {'loss': 0.4176, 'grad_norm': 9.069355964660645, 'learning_rate': 1.5602733364484443e-05} -{'loss': 0.4176, 'grad_norm': 9.069355964660645, 'learning_rate': 1.5602733364484443e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2859352111816406, 'train/info_loss': 0.15540634095668793, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011045406572520734, 'train/video_loss': 0.15529589354991913, 'train/total_loss': 0.4412311315536499} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2362, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37310819625854497, 'train/info_loss': 0.09549912810325623, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010856268927454949, 'train/video_loss': 0.09539056569337845, 'train/total_loss': 0.46849876642227173} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0738, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4192, 'grad_norm': 8.861328125, 'learning_rate': 1.559389416588857e-05}[Rank 2] Trainer log: {'loss': 0.4192, 'grad_norm': 8.861328125, 'learning_rate': 1.559389416588857e-05}[Rank 1] Trainer log: {'loss': 0.4192, 'grad_norm': 8.861328125, 'learning_rate': 1.559389416588857e-05} - - -[Rank 0] Trainer log: {'loss': 0.4192, 'grad_norm': 8.861328125, 'learning_rate': 1.559389416588857e-05} -{'loss': 0.4192, 'grad_norm': 8.861328125, 'learning_rate': 1.559389416588857e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.2593, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011542354710400106, 'train/lm_loss': 8.845757110975683e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.39022648334503174, 'train/uncertainty_loss': 0.025927710533142093, 'train/video_loss': 0.42542189359664917, 'train/total_loss': 0.42551034688949585} -tensor(0.2010, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21282689571380617, 'train/info_loss': 0.16277679800987244, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011912331683561206, 'train/video_loss': 0.16265767812728882, 'train/total_loss': 0.3754845857620239} -tensor(0.1186, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1578, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.39, 'grad_norm': 19.60680389404297, 'learning_rate': 1.5585048601633976e-05}[Rank 3] Trainer log: {'loss': 0.39, 'grad_norm': 19.60680389404297, 'learning_rate': 1.5585048601633976e-05}[Rank 2] Trainer log: {'loss': 0.39, 'grad_norm': 19.60680389404297, 'learning_rate': 1.5585048601633976e-05} - - -[Rank 0] Trainer log: {'loss': 0.39, 'grad_norm': 19.60680389404297, 'learning_rate': 1.5585048601633976e-05} -{'loss': 0.39, 'grad_norm': 19.60680389404297, 'learning_rate': 1.5585048601633976e-05, 'epoch': 0.35} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24781312942504885, 'train/info_loss': 0.2032291293144226, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001291562570258975, 'train/video_loss': 0.20309996604919434, 'train/total_loss': 0.45091310143470764} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.6845, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1299, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4210, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009716834872961045, 'train/lm_loss': 8.843374089337886e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.5103176832199097, 'train/uncertainty_loss': 0.04210207760334015, 'train/video_loss': 0.5602284073829651, 'train/total_loss': 0.5603168606758118} -[Rank 1] Trainer log: {'loss': 0.5288, 'grad_norm': 6.454098701477051, 'learning_rate': 1.5576196681786605e-05} -[Rank 2] Trainer log: {'loss': 0.5288, 'grad_norm': 6.454098701477051, 'learning_rate': 1.5576196681786605e-05} -[Rank 3] Trainer log: {'loss': 0.5288, 'grad_norm': 6.454098701477051, 'learning_rate': 1.5576196681786605e-05} -[Rank 0] Trainer log: {'loss': 0.5288, 'grad_norm': 6.454098701477051, 'learning_rate': 1.5576196681786605e-05} -{'loss': 0.5288, 'grad_norm': 6.454098701477051, 'learning_rate': 1.5576196681786605e-05, 'epoch': 0.35} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34071705341339115, 'train/info_loss': 0.20294854044914246, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012273037573322655, 'train/video_loss': 0.20282581448554993, 'train/total_loss': 0.5435428619384766} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24937863349914552, 'train/info_loss': 0.19028571248054504, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010403873166069389, 'train/video_loss': 0.1901816725730896, 'train/total_loss': 0.43956029415130615} -tensor(0.5035, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.5117, 'grad_norm': 5.684687614440918, 'learning_rate': 1.556733841641964e-05}[Rank 2] Trainer log: {'loss': 0.5117, 'grad_norm': 5.684687614440918, 'learning_rate': 1.556733841641964e-05}[Rank 3] Trainer log: {'loss': 0.5117, 'grad_norm': 5.684687614440918, 'learning_rate': 1.556733841641964e-05} - - -[Rank 1] Trainer log: {'loss': 0.5117, 'grad_norm': 5.684687614440918, 'learning_rate': 1.556733841641964e-05} -{'loss': 0.5117, 'grad_norm': 5.684687614440918, 'learning_rate': 1.556733841641964e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2633425235748291, 'train/info_loss': 0.1372365653514862, 'train/ref_loss': None, 'train/uncertainty_loss': -9.703071555122733e-05, 'train/video_loss': 0.13713952898979187, 'train/total_loss': 0.40048205852508545} -tensor(0.0124, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0883, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011168060824275018, 'train/lm_loss': 8.867204887792469e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.12059136480093002, 'train/uncertainty_loss': -7.523508393205703e-05, 'train/video_loss': 0.12948627769947052, 'train/total_loss': 0.1295749545097351} -[Rank 2] Trainer log: {'loss': 0.3171, 'grad_norm': 2.851663589477539, 'learning_rate': 1.5558473815613476e-05}[Rank 3] Trainer log: {'loss': 0.3171, 'grad_norm': 2.851663589477539, 'learning_rate': 1.5558473815613476e-05}[Rank 1] Trainer log: {'loss': 0.3171, 'grad_norm': 2.851663589477539, 'learning_rate': 1.5558473815613476e-05} - - -[Rank 0] Trainer log: {'loss': 0.3171, 'grad_norm': 2.851663589477539, 'learning_rate': 1.5558473815613476e-05} -{'loss': 0.3171, 'grad_norm': 2.851663589477539, 'learning_rate': 1.5558473815613476e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0158, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17596905231475832, 'train/info_loss': 0.24702085554599762, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011676655849441887, 'train/video_loss': 0.24690409004688263, 'train/total_loss': 0.4228731393814087} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2460472345352173, 'train/info_loss': 0.14581267535686493, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011521355481818319, 'train/video_loss': 0.1456974595785141, 'train/total_loss': 0.3917447030544281} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2103, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.369, 'grad_norm': 4.058562755584717, 'learning_rate': 1.5549602889455728e-05} -[Rank 3] Trainer log: {'loss': 0.369, 'grad_norm': 4.058562755584717, 'learning_rate': 1.5549602889455728e-05} -[Rank 1] Trainer log: {'loss': 0.369, 'grad_norm': 4.058562755584717, 'learning_rate': 1.5549602889455728e-05}[Rank 0] Trainer log: {'loss': 0.369, 'grad_norm': 4.058562755584717, 'learning_rate': 1.5549602889455728e-05} - -{'loss': 0.369, 'grad_norm': 4.058562755584717, 'learning_rate': 1.5549602889455728e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29470272064208985, 'train/info_loss': 0.1783895492553711, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013133628526702524, 'train/video_loss': 0.17825821042060852, 'train/total_loss': 0.4729609489440918} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0323, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.9131, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000974928867071867, 'train/lm_loss': 8.817159687168896e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.838339626789093, 'train/uncertainty_loss': 0.0913073480129242, 'train/video_loss': 0.9374810457229614, 'train/total_loss': 0.9375692009925842} -[Rank 1] Trainer log: {'loss': 0.5385, 'grad_norm': 5.133808612823486, 'learning_rate': 1.5540725648041205e-05}[Rank 2] Trainer log: {'loss': 0.5385, 'grad_norm': 5.133808612823486, 'learning_rate': 1.5540725648041205e-05} -[Rank 3] Trainer log: {'loss': 0.5385, 'grad_norm': 5.133808612823486, 'learning_rate': 1.5540725648041205e-05} - -[Rank 0] Trainer log: {'loss': 0.5385, 'grad_norm': 5.133808612823486, 'learning_rate': 1.5540725648041205e-05} -{'loss': 0.5385, 'grad_norm': 5.133808612823486, 'learning_rate': 1.5540725648041205e-05, 'epoch': 0.35} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25903470516204835, 'train/info_loss': 0.21437111496925354, 'train/ref_loss': None, 'train/uncertainty_loss': -9.438220877200365e-05, 'train/video_loss': 0.2142767310142517, 'train/total_loss': 0.4733114540576935} -tensor(0.5352, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(0.0771, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3772, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012618489563465118, 'train/lm_loss': 6.839059642516077e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.4656543731689453, 'train/uncertainty_loss': 0.03771707713603974, 'train/video_loss': 0.5134980082511902, 'train/total_loss': 0.5135663747787476} -[Rank 3] Trainer log: {'loss': 0.4997, 'grad_norm': 12.92607593536377, 'learning_rate': 1.55318421014719e-05} -[Rank 1] Trainer log: {'loss': 0.4997, 'grad_norm': 12.92607593536377, 'learning_rate': 1.55318421014719e-05} -[Rank 2] Trainer log: {'loss': 0.4997, 'grad_norm': 12.92607593536377, 'learning_rate': 1.55318421014719e-05} -[Rank 0] Trainer log: {'loss': 0.4997, 'grad_norm': 12.92607593536377, 'learning_rate': 1.55318421014719e-05} -{'loss': 0.4997, 'grad_norm': 12.92607593536377, 'learning_rate': 1.55318421014719e-05, 'epoch': 0.35} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16443777084350586, 'train/info_loss': 0.14727389812469482, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011111258063465357, 'train/video_loss': 0.1471627801656723, 'train/total_loss': 0.31160056591033936} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.384514856338501, 'train/info_loss': 0.1494957059621811, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011423933319747448, 'train/video_loss': 0.14938147366046906, 'train/total_loss': 0.5338963270187378} -tensor(0.0508, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4393, 'grad_norm': 2.4638516902923584, 'learning_rate': 1.552295225985698e-05}[Rank 2] Trainer log: {'loss': 0.4393, 'grad_norm': 2.4638516902923584, 'learning_rate': 1.552295225985698e-05} - -[Rank 3] Trainer log: {'loss': 0.4393, 'grad_norm': 2.4638516902923584, 'learning_rate': 1.552295225985698e-05} -[Rank 1] Trainer log: {'loss': 0.4393, 'grad_norm': 2.4638516902923584, 'learning_rate': 1.552295225985698e-05} -{'loss': 0.4393, 'grad_norm': 2.4638516902923584, 'learning_rate': 1.552295225985698e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0719, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=)tensor(0.0181, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) - tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011035840027034283, 'train/lm_loss': 7.954445900395513e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.2037462294101715, 'train/uncertainty_loss': 0.001806824468076229, 'train/video_loss': 0.2144177258014679, 'train/total_loss': 0.21449726819992065} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30186376571655277, 'train/info_loss': 0.1962280571460724, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011309148976579309, 'train/video_loss': 0.19611497223377228, 'train/total_loss': 0.49797874689102173} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2625, 'grad_norm': 3.806175708770752, 'learning_rate': 1.551405613331278e-05}[Rank 2] Trainer log: {'loss': 0.2625, 'grad_norm': 3.806175708770752, 'learning_rate': 1.551405613331278e-05}[Rank 3] Trainer log: {'loss': 0.2625, 'grad_norm': 3.806175708770752, 'learning_rate': 1.551405613331278e-05} - - -[Rank 0] Trainer log: {'loss': 0.2625, 'grad_norm': 3.806175708770752, 'learning_rate': 1.551405613331278e-05} -{'loss': 0.2625, 'grad_norm': 3.806175708770752, 'learning_rate': 1.551405613331278e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3062163829803467, 'train/info_loss': 0.24194936454296112, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001121704000979662, 'train/video_loss': 0.24183718860149384, 'train/total_loss': 0.5480535626411438} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1755, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26810171604156496, 'train/info_loss': 0.3762279450893402, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012322432594373824, 'train/video_loss': 0.3761047124862671, 'train/total_loss': 0.6442064046859741} -tensor(0.1342, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4563, 'grad_norm': 3.727020502090454, 'learning_rate': 1.5505153731962793e-05} -[Rank 1] Trainer log: {'loss': 0.4563, 'grad_norm': 3.727020502090454, 'learning_rate': 1.5505153731962793e-05} -[Rank 3] Trainer log: {'loss': 0.4563, 'grad_norm': 3.727020502090454, 'learning_rate': 1.5505153731962793e-05} -[Rank 0] Trainer log: {'loss': 0.4563, 'grad_norm': 3.727020502090454, 'learning_rate': 1.5505153731962793e-05} -{'loss': 0.4563, 'grad_norm': 3.727020502090454, 'learning_rate': 1.5505153731962793e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23879854679107668, 'train/info_loss': 0.17717301845550537, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013481987407431008, 'train/video_loss': 0.17703819274902344, 'train/total_loss': 0.41583675146102905} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011375119909644126, 'train/lm_loss': 0.00012200925266370177, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.15250152349472046, 'train/uncertainty_loss': -7.22432800102979e-05, 'train/video_loss': 0.1615705043077469, 'train/total_loss': 0.1616925150156021} -tensor(0.1125, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4193, 'grad_norm': 4.344142436981201, 'learning_rate': 1.549624506593763e-05}[Rank 2] Trainer log: {'loss': 0.4193, 'grad_norm': 4.344142436981201, 'learning_rate': 1.549624506593763e-05}[Rank 3] Trainer log: {'loss': 0.4193, 'grad_norm': 4.344142436981201, 'learning_rate': 1.549624506593763e-05} - - -[Rank 0] Trainer log: {'loss': 0.4193, 'grad_norm': 4.344142436981201, 'learning_rate': 1.549624506593763e-05} -{'loss': 0.4193, 'grad_norm': 4.344142436981201, 'learning_rate': 1.549624506593763e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18840432167053223, 'train/info_loss': 0.1523984670639038, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011232917895540596, 'train/video_loss': 0.15228614211082458, 'train/total_loss': 0.3406904637813568} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010710135102272034, 'train/lm_loss': 0.00011540896957740188, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.1592293232679367, 'train/uncertainty_loss': -6.764151621609927e-05, 'train/video_loss': 0.16776996850967407, 'train/total_loss': 0.16788537800312042} -[Rank 2] Trainer log: {'loss': 0.3603, 'grad_norm': 3.254692554473877, 'learning_rate': 1.548733014537506e-05}[Rank 1] Trainer log: {'loss': 0.3603, 'grad_norm': 3.254692554473877, 'learning_rate': 1.548733014537506e-05} -[Rank 3] Trainer log: {'loss': 0.3603, 'grad_norm': 3.254692554473877, 'learning_rate': 1.548733014537506e-05} - -[Rank 0] Trainer log: {'loss': 0.3603, 'grad_norm': 3.254692554473877, 'learning_rate': 1.548733014537506e-05} -{'loss': 0.3603, 'grad_norm': 3.254692554473877, 'learning_rate': 1.548733014537506e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4879484653472901, 'train/info_loss': 0.21059933304786682, 'train/ref_loss': None, 'train/uncertainty_loss': -9.186017559841275e-05, 'train/video_loss': 0.21050746738910675, 'train/total_loss': 0.6984559297561646} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3129342079162598, 'train/info_loss': 0.18776187300682068, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010979719227179886, 'train/video_loss': 0.1876520812511444, 'train/total_loss': 0.5005862712860107} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3883, 'grad_norm': 5.762475967407227, 'learning_rate': 1.5478408980419944e-05}[Rank 1] Trainer log: {'loss': 0.3883, 'grad_norm': 5.762475967407227, 'learning_rate': 1.5478408980419944e-05}[Rank 2] Trainer log: {'loss': 0.3883, 'grad_norm': 5.762475967407227, 'learning_rate': 1.5478408980419944e-05} - - -[Rank 0] Trainer log: {'loss': 0.3883, 'grad_norm': 5.762475967407227, 'learning_rate': 1.5478408980419944e-05} -{'loss': 0.3883, 'grad_norm': 5.762475967407227, 'learning_rate': 1.5478408980419944e-05, 'epoch': 0.35} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010767504572868347, 'train/lm_loss': 9.467744384892285e-05, 'train/info_loss': 3.80263190891128e-05, 'train/ref_loss': 0.09809298813343048, 'train/uncertainty_loss': -7.036722963675857e-05, 'train/video_loss': 0.10667464882135391, 'train/total_loss': 0.10676932334899902} -tensor(0.2545, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38148367404937744, 'train/info_loss': 0.24267932772636414, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013340417062863709, 'train/video_loss': 0.24254591763019562, 'train/total_loss': 0.6240295767784119} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0102, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3389, 'grad_norm': 3.7216715812683105, 'learning_rate': 1.5469481581224274e-05}[Rank 1] Trainer log: {'loss': 0.3389, 'grad_norm': 3.7216715812683105, 'learning_rate': 1.5469481581224274e-05}[Rank 3] Trainer log: {'loss': 0.3389, 'grad_norm': 3.7216715812683105, 'learning_rate': 1.5469481581224274e-05} - -[Rank 2] Trainer log: {'loss': 0.3389, 'grad_norm': 3.7216715812683105, 'learning_rate': 1.5469481581224274e-05} - -{'loss': 0.3389, 'grad_norm': 3.7216715812683105, 'learning_rate': 1.5469481581224274e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3382720708847046, 'train/info_loss': 0.15727825462818146, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011491467012092471, 'train/video_loss': 0.1571633368730545, 'train/total_loss': 0.4954354166984558} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08604118227958679, 'train/info_loss': 0.15140004456043243, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011334181763231755, 'train/video_loss': 0.15128670632839203, 'train/total_loss': 0.23732788860797882} -tensor(0.3826, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3614, 'grad_norm': 3.5042946338653564, 'learning_rate': 1.5460547957947105e-05}[Rank 2] Trainer log: {'loss': 0.3614, 'grad_norm': 3.5042946338653564, 'learning_rate': 1.5460547957947105e-05}[Rank 3] Trainer log: {'loss': 0.3614, 'grad_norm': 3.5042946338653564, 'learning_rate': 1.5460547957947105e-05} - - -[Rank 0] Trainer log: {'loss': 0.3614, 'grad_norm': 3.5042946338653564, 'learning_rate': 1.5460547957947105e-05} -{'loss': 0.3614, 'grad_norm': 3.5042946338653564, 'learning_rate': 1.5460547957947105e-05, 'epoch': 0.35} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14044151306152344, 'train/info_loss': 0.49465325474739075, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013762947637587785, 'train/video_loss': 0.49451562762260437, 'train/total_loss': 0.6349571347236633} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.0174, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0749, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0316, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00099629620090127, 'train/lm_loss': 8.929166360758246e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.20883941650390625, 'train/uncertainty_loss': 0.0031569022685289383, 'train/video_loss': 0.22000351548194885, 'train/total_loss': 0.2200928032398224} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3836, 'grad_norm': 4.291467666625977, 'learning_rate': 1.54516081207546e-05}[Rank 1] Trainer log: {'loss': 0.3836, 'grad_norm': 4.291467666625977, 'learning_rate': 1.54516081207546e-05}[Rank 2] Trainer log: {'loss': 0.3836, 'grad_norm': 4.291467666625977, 'learning_rate': 1.54516081207546e-05} - - -[Rank 0] Trainer log: {'loss': 0.3836, 'grad_norm': 4.291467666625977, 'learning_rate': 1.54516081207546e-05} -{'loss': 0.3836, 'grad_norm': 4.291467666625977, 'learning_rate': 1.54516081207546e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1747468113899231, 'train/info_loss': 0.07281667739152908, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012504017213359477, 'train/video_loss': 0.07269163429737091, 'train/total_loss': 0.247438445687294} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2400, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010656355880200863, 'train/lm_loss': 7.878182223066688e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.37674883008003235, 'train/uncertainty_loss': 0.023998963832855227, 'train/video_loss': 0.409306138753891, 'train/total_loss': 0.40938490629196167} -[Rank 3] Trainer log: {'loss': 0.2863, 'grad_norm': 2.6677374839782715, 'learning_rate': 1.5442662079819994e-05}[Rank 2] Trainer log: {'loss': 0.2863, 'grad_norm': 2.6677374839782715, 'learning_rate': 1.5442662079819994e-05} - -[Rank 1] Trainer log: {'loss': 0.2863, 'grad_norm': 2.6677374839782715, 'learning_rate': 1.5442662079819994e-05}[Rank 0] Trainer log: {'loss': 0.2863, 'grad_norm': 2.6677374839782715, 'learning_rate': 1.5442662079819994e-05} - -{'loss': 0.2863, 'grad_norm': 2.6677374839782715, 'learning_rate': 1.5442662079819994e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4329216480255127, 'train/info_loss': 0.19014501571655273, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010990587761625648, 'train/video_loss': 0.19003510475158691, 'train/total_loss': 0.6229567527770996} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0864, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1284, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009875372052192688, 'train/lm_loss': 9.470127988606692e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.27870428562164307, 'train/uncertainty_loss': 0.012838959693908691, 'train/video_loss': 0.299480676651001, 'train/total_loss': 0.2995753884315491} -[Rank 1] Trainer log: {'loss': 0.402, 'grad_norm': 6.945250511169434, 'learning_rate': 1.5433709845323563e-05}[Rank 2] Trainer log: {'loss': 0.402, 'grad_norm': 6.945250511169434, 'learning_rate': 1.5433709845323563e-05} - -[Rank 3] Trainer log: {'loss': 0.402, 'grad_norm': 6.945250511169434, 'learning_rate': 1.5433709845323563e-05} -[Rank 0] Trainer log: {'loss': 0.402, 'grad_norm': 6.945250511169434, 'learning_rate': 1.5433709845323563e-05} -{'loss': 0.402, 'grad_norm': 6.945250511169434, 'learning_rate': 1.5433709845323563e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30093691349029544, 'train/info_loss': 0.20165258646011353, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010522044030949474, 'train/video_loss': 0.20154736936092377, 'train/total_loss': 0.5024842619895935} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.6286, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0960148274898529, 'train/info_loss': 0.0939614400267601, 'train/ref_loss': None, 'train/uncertainty_loss': -9.903947357088328e-05, 'train/video_loss': 0.09386239945888519, 'train/total_loss': 0.1898772269487381} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3702, 'grad_norm': 7.970930576324463, 'learning_rate': 1.5424751427452644e-05}[Rank 3] Trainer log: {'loss': 0.3702, 'grad_norm': 7.970930576324463, 'learning_rate': 1.5424751427452644e-05}[Rank 1] Trainer log: {'loss': 0.3702, 'grad_norm': 7.970930576324463, 'learning_rate': 1.5424751427452644e-05} - -[Rank 0] Trainer log: {'loss': 0.3702, 'grad_norm': 7.970930576324463, 'learning_rate': 1.5424751427452644e-05} - -{'loss': 0.3702, 'grad_norm': 7.970930576324463, 'learning_rate': 1.5424751427452644e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1297, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010275993496179582, 'train/lm_loss': 7.809068192727865e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.29069000482559204, 'train/uncertainty_loss': 0.012970606982707978, 'train/video_loss': 0.3119160234928131, 'train/total_loss': 0.31199410557746887} -tensor(0.3001, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08984715938568116, 'train/info_loss': 0.22860531508922577, 'train/ref_loss': None, 'train/uncertainty_loss': -9.337186347693206e-05, 'train/video_loss': 0.22851194441318512, 'train/total_loss': 0.3183591067790985} -tensor(0.0416, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3942, 'grad_norm': 11.946371078491211, 'learning_rate': 1.5415786836401604e-05}[Rank 2] Trainer log: {'loss': 0.3942, 'grad_norm': 11.946371078491211, 'learning_rate': 1.5415786836401604e-05}[Rank 1] Trainer log: {'loss': 0.3942, 'grad_norm': 11.946371078491211, 'learning_rate': 1.5415786836401604e-05} - - -[Rank 0] Trainer log: {'loss': 0.3942, 'grad_norm': 11.946371078491211, 'learning_rate': 1.5415786836401604e-05} -{'loss': 0.3942, 'grad_norm': 11.946371078491211, 'learning_rate': 1.5415786836401604e-05, 'epoch': 0.35} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24079811573028564, 'train/info_loss': 0.10708942264318466, 'train/ref_loss': None, 'train/uncertainty_loss': -9.969237726181746e-05, 'train/video_loss': 0.10698973387479782, 'train/total_loss': 0.34778785705566406} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1093, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001061242911964655, 'train/lm_loss': 0.0001150753814727068, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.28542575240135193, 'train/uncertainty_loss': 0.010928051173686983, 'train/video_loss': 0.30488452315330505, 'train/total_loss': 0.30499958992004395} -[Rank 2] Trainer log: {'loss': 0.3899, 'grad_norm': 3.1676247119903564, 'learning_rate': 1.5406816082371843e-05}[Rank 1] Trainer log: {'loss': 0.3899, 'grad_norm': 3.1676247119903564, 'learning_rate': 1.5406816082371843e-05}[Rank 3] Trainer log: {'loss': 0.3899, 'grad_norm': 3.1676247119903564, 'learning_rate': 1.5406816082371843e-05} - - -[Rank 0] Trainer log: {'loss': 0.3899, 'grad_norm': 3.1676247119903564, 'learning_rate': 1.5406816082371843e-05} -{'loss': 0.3899, 'grad_norm': 3.1676247119903564, 'learning_rate': 1.5406816082371843e-05, 'epoch': 0.35} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.3183, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010521597228944303, 'train/lm_loss': 0.00010673535289242864, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.4256397485733032, 'train/uncertainty_loss': 0.03182973563671112, 'train/video_loss': 0.4659269452095032, 'train/total_loss': 0.4660336673259735} -tensor(0.0103, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0410, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3603096008300781, 'train/info_loss': 0.08860121667385101, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011757876491174102, 'train/video_loss': 0.08848363906145096, 'train/total_loss': 0.4487932324409485} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3896, 'grad_norm': 4.7439727783203125, 'learning_rate': 1.5397839175571773e-05}[Rank 2] Trainer log: {'loss': 0.3896, 'grad_norm': 4.7439727783203125, 'learning_rate': 1.5397839175571773e-05}[Rank 3] Trainer log: {'loss': 0.3896, 'grad_norm': 4.7439727783203125, 'learning_rate': 1.5397839175571773e-05} -[Rank 0] Trainer log: {'loss': 0.3896, 'grad_norm': 4.7439727783203125, 'learning_rate': 1.5397839175571773e-05} - - -{'loss': 0.3896, 'grad_norm': 4.7439727783203125, 'learning_rate': 1.5397839175571773e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29651257991790775, 'train/info_loss': 0.23680071532726288, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013286418979987502, 'train/video_loss': 0.23666785657405853, 'train/total_loss': 0.5331804156303406} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1121, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2153, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2935965538024902, 'train/info_loss': 0.17344465851783752, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010376318823546171, 'train/video_loss': 0.17334090173244476, 'train/total_loss': 0.46693748235702515} -tensor(0.0435, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4211, 'grad_norm': 2.842745780944824, 'learning_rate': 1.5388856126216797e-05} -[Rank 2] Trainer log: {'loss': 0.4211, 'grad_norm': 2.842745780944824, 'learning_rate': 1.5388856126216797e-05} -[Rank 0] Trainer log: {'loss': 0.4211, 'grad_norm': 2.842745780944824, 'learning_rate': 1.5388856126216797e-05}[Rank 3] Trainer log: {'loss': 0.4211, 'grad_norm': 2.842745780944824, 'learning_rate': 1.5388856126216797e-05} - -{'loss': 0.4211, 'grad_norm': 2.842745780944824, 'learning_rate': 1.5388856126216797e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0307, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010209997184574605, 'train/lm_loss': 0.0001014928799122572, 'train/info_loss': 3.862231824314222e-05, 'train/ref_loss': 0.24132299423217773, 'train/uncertainty_loss': 0.003071655333042145, 'train/video_loss': 0.2526012659072876, 'train/total_loss': 0.2527027726173401} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2859298944473267, 'train/info_loss': 0.22619538009166718, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001436907215975225, 'train/video_loss': 0.2260516881942749, 'train/total_loss': 0.5119816064834595} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.408, 'grad_norm': 1.871898889541626, 'learning_rate': 1.537986694452932e-05} -[Rank 0] Trainer log: {'loss': 0.408, 'grad_norm': 1.871898889541626, 'learning_rate': 1.537986694452932e-05}[Rank 1] Trainer log: {'loss': 0.408, 'grad_norm': 1.871898889541626, 'learning_rate': 1.537986694452932e-05} -[Rank 3] Trainer log: {'loss': 0.408, 'grad_norm': 1.871898889541626, 'learning_rate': 1.537986694452932e-05} - -{'loss': 0.408, 'grad_norm': 1.871898889541626, 'learning_rate': 1.537986694452932e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12224088907241822, 'train/info_loss': 0.19250799715518951, 'train/ref_loss': None, 'train/uncertainty_loss': -8.596525876782835e-05, 'train/video_loss': 0.1924220323562622, 'train/total_loss': 0.3146629333496094} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0938, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2754035234451294, 'train/info_loss': 0.15518033504486084, 'train/ref_loss': None, 'train/uncertainty_loss': -9.694314212538302e-05, 'train/video_loss': 0.15508338809013367, 'train/total_loss': 0.43048691749572754} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3542, 'grad_norm': 4.167984962463379, 'learning_rate': 1.5370871640738726e-05} -[Rank 0] Trainer log: {'loss': 0.3542, 'grad_norm': 4.167984962463379, 'learning_rate': 1.5370871640738726e-05} -[Rank 3] Trainer log: {'loss': 0.3542, 'grad_norm': 4.167984962463379, 'learning_rate': 1.5370871640738726e-05}[Rank 2] Trainer log: {'loss': 0.3542, 'grad_norm': 4.167984962463379, 'learning_rate': 1.5370871640738726e-05} - -{'loss': 0.3542, 'grad_norm': 4.167984962463379, 'learning_rate': 1.5370871640738726e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32282857894897465, 'train/info_loss': 0.23095642030239105, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001127253519371152, 'train/video_loss': 0.2308436930179596, 'train/total_loss': 0.5536723136901855} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0356, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008435542695224285, 'train/lm_loss': 0.00013194499770179393, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.10961463302373886, 'train/uncertainty_loss': -7.598024676553906e-05, 'train/video_loss': 0.11632785201072693, 'train/total_loss': 0.11645979434251785} -tensor(1.1930, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1070, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3909, 'grad_norm': 6.613725185394287, 'learning_rate': 1.536187022508135e-05}[Rank 2] Trainer log: {'loss': 0.3909, 'grad_norm': 6.613725185394287, 'learning_rate': 1.536187022508135e-05} -[Rank 0] Trainer log: {'loss': 0.3909, 'grad_norm': 6.613725185394287, 'learning_rate': 1.536187022508135e-05}[Rank 3] Trainer log: {'loss': 0.3909, 'grad_norm': 6.613725185394287, 'learning_rate': 1.536187022508135e-05} - - -{'loss': 0.3909, 'grad_norm': 6.613725185394287, 'learning_rate': 1.536187022508135e-05, 'epoch': 0.35} -tensor(-0.0016, device='cuda:1', grad_fn=) tensor(-0.0016, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2988748550415039, 'train/info_loss': 0.19719533622264862, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010107643902301788, 'train/video_loss': 0.19709426164627075, 'train/total_loss': 0.49596911668777466} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34527244567871096, 'train/info_loss': 0.21101222932338715, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001573348534293473, 'train/video_loss': 0.2108548879623413, 'train/total_loss': 0.5561273097991943} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0195, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3913, 'grad_norm': 6.201315879821777, 'learning_rate': 1.5352862707800496e-05}[Rank 2] Trainer log: {'loss': 0.3913, 'grad_norm': 6.201315879821777, 'learning_rate': 1.5352862707800496e-05}[Rank 1] Trainer log: {'loss': 0.3913, 'grad_norm': 6.201315879821777, 'learning_rate': 1.5352862707800496e-05} - - -[Rank 0] Trainer log: {'loss': 0.3913, 'grad_norm': 6.201315879821777, 'learning_rate': 1.5352862707800496e-05} -{'loss': 0.3913, 'grad_norm': 6.201315879821777, 'learning_rate': 1.5352862707800496e-05, 'epoch': 0.35} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29634096622467043, 'train/info_loss': 0.15808095037937164, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010750408982858063, 'train/video_loss': 0.15797345340251923, 'train/total_loss': 0.4543144106864929} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0774, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009579878300428391, 'train/lm_loss': 0.00011445584241300822, 'train/info_loss': 4.273470403859392e-05, 'train/ref_loss': 0.26080724596977234, 'train/uncertainty_loss': 0.007735628634691238, 'train/video_loss': 0.2762495279312134, 'train/total_loss': 0.2763639986515045} -tensor(0.0177, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.3751, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3287, 'grad_norm': 5.453532695770264, 'learning_rate': 1.5343849099146414e-05}[Rank 2] Trainer log: {'loss': 0.3287, 'grad_norm': 5.453532695770264, 'learning_rate': 1.5343849099146414e-05}[Rank 0] Trainer log: {'loss': 0.3287, 'grad_norm': 5.453532695770264, 'learning_rate': 1.5343849099146414e-05} - -[Rank 1] Trainer log: {'loss': 0.3287, 'grad_norm': 5.453532695770264, 'learning_rate': 1.5343849099146414e-05} - -{'loss': 0.3287, 'grad_norm': 5.453532695770264, 'learning_rate': 1.5343849099146414e-05, 'epoch': 0.35} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1535, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008847201243042946, 'train/lm_loss': 0.000147145485971123, 'train/info_loss': 4.410549081512727e-05, 'train/ref_loss': 0.31431278586387634, 'train/uncertainty_loss': 0.01535104662179947, 'train/video_loss': 0.3367857038974762, 'train/total_loss': 0.33693283796310425} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3298, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0124, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010694749653339386, 'train/lm_loss': 0.00014688342344015837, 'train/info_loss': 4.4820684706792235e-05, 'train/ref_loss': 0.2252657115459442, 'train/uncertainty_loss': 0.0012439009733498096, 'train/video_loss': 0.23511023819446564, 'train/total_loss': 0.2352571189403534} -[Rank 2] Trainer log: {'loss': 0.3401, 'grad_norm': 12.84589672088623, 'learning_rate': 1.5334829409376272e-05} -[Rank 1] Trainer log: {'loss': 0.3401, 'grad_norm': 12.84589672088623, 'learning_rate': 1.5334829409376272e-05} -[Rank 0] Trainer log: {'loss': 0.3401, 'grad_norm': 12.84589672088623, 'learning_rate': 1.5334829409376272e-05}[Rank 3] Trainer log: {'loss': 0.3401, 'grad_norm': 12.84589672088623, 'learning_rate': 1.5334829409376272e-05} - -{'loss': 0.3401, 'grad_norm': 12.84589672088623, 'learning_rate': 1.5334829409376272e-05, 'epoch': 0.35} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0098, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009276930242776871, 'train/lm_loss': 8.962529827840627e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.20891022682189941, 'train/uncertainty_loss': -7.034019217826426e-05, 'train/video_loss': 0.21629826724529266, 'train/total_loss': 0.21638789772987366} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3113407850265503, 'train/info_loss': 0.2577396035194397, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012173058930784464, 'train/video_loss': 0.25761786103248596, 'train/total_loss': 0.5689586400985718} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3866, 'grad_norm': 5.194007873535156, 'learning_rate': 1.532580364875417e-05}[Rank 1] Trainer log: {'loss': 0.3866, 'grad_norm': 5.194007873535156, 'learning_rate': 1.532580364875417e-05} - -[Rank 0] Trainer log: {'loss': 0.3866, 'grad_norm': 5.194007873535156, 'learning_rate': 1.532580364875417e-05}[Rank 2] Trainer log: {'loss': 0.3866, 'grad_norm': 5.194007873535156, 'learning_rate': 1.532580364875417e-05} - -{'loss': 0.3866, 'grad_norm': 5.194007873535156, 'learning_rate': 1.532580364875417e-05, 'epoch': 0.36} -tensor(-0.0019, device='cuda:0', grad_fn=) tensor(-0.0019, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33760325908660893, 'train/info_loss': 0.3360842764377594, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00018782146507874132, 'train/video_loss': 0.33589646220207214, 'train/total_loss': 0.6734997034072876} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010193645022809506, 'train/lm_loss': 7.816217839717865e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.12942001223564148, 'train/uncertainty_loss': -7.24509300198406e-05, 'train/video_loss': 0.13753734529018402, 'train/total_loss': 0.13761550188064575} -tensor(0.0187, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3411, 'grad_norm': 2.5195579528808594, 'learning_rate': 1.5316771827551108e-05} -[Rank 0] Trainer log: {'loss': 0.3411, 'grad_norm': 2.5195579528808594, 'learning_rate': 1.5316771827551108e-05}[Rank 3] Trainer log: {'loss': 0.3411, 'grad_norm': 2.5195579528808594, 'learning_rate': 1.5316771827551108e-05}[Rank 2] Trainer log: {'loss': 0.3411, 'grad_norm': 2.5195579528808594, 'learning_rate': 1.5316771827551108e-05} - - -{'loss': 0.3411, 'grad_norm': 2.5195579528808594, 'learning_rate': 1.5316771827551108e-05, 'epoch': 0.36} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0655, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008079893887042999, 'train/lm_loss': 0.00010845104698091746, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.2622256875038147, 'train/uncertainty_loss': 0.00655200481414795, 'train/video_loss': 0.27528274059295654, 'train/total_loss': 0.27539119124412537} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3528272390365601, 'train/info_loss': 0.20893122255802155, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012424985179677607, 'train/video_loss': 0.20880697667598724, 'train/total_loss': 0.5616342425346375} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4199, 'grad_norm': 4.872361183166504, 'learning_rate': 1.5307733956045e-05}[Rank 2] Trainer log: {'loss': 0.4199, 'grad_norm': 4.872361183166504, 'learning_rate': 1.5307733956045e-05} -[Rank 0] Trainer log: {'loss': 0.4199, 'grad_norm': 4.872361183166504, 'learning_rate': 1.5307733956045e-05} -[Rank 1] Trainer log: {'loss': 0.4199, 'grad_norm': 4.872361183166504, 'learning_rate': 1.5307733956045e-05} - -{'loss': 0.4199, 'grad_norm': 4.872361183166504, 'learning_rate': 1.5307733956045e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009987554512917995, 'train/lm_loss': 8.983978186734022e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.10249096900224686, 'train/uncertainty_loss': -7.272615912370385e-05, 'train/video_loss': 0.11044750362634659, 'train/total_loss': 0.1105373427271843} -tensor(0.0060, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0275, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0246, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009527295827865601, 'train/lm_loss': 7.840050384402276e-05, 'train/info_loss': 3.43310966854915e-05, 'train/ref_loss': 0.18044431507587433, 'train/uncertainty_loss': -7.384181953966618e-05, 'train/video_loss': 0.18802665174007416, 'train/total_loss': 0.188105046749115} -[Rank 3] Trainer log: {'loss': 0.3163, 'grad_norm': 3.2908990383148193, 'learning_rate': 1.5298690044520617e-05}[Rank 0] Trainer log: {'loss': 0.3163, 'grad_norm': 3.2908990383148193, 'learning_rate': 1.5298690044520617e-05} -[Rank 1] Trainer log: {'loss': 0.3163, 'grad_norm': 3.2908990383148193, 'learning_rate': 1.5298690044520617e-05}[Rank 2] Trainer log: {'loss': 0.3163, 'grad_norm': 3.2908990383148193, 'learning_rate': 1.5298690044520617e-05} - - -{'loss': 0.3163, 'grad_norm': 3.2908990383148193, 'learning_rate': 1.5298690044520617e-05, 'epoch': 0.36} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1693402409553528, 'train/info_loss': 0.17090441286563873, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001065023010596633, 'train/video_loss': 0.1707979142665863, 'train/total_loss': 0.34013813734054565} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18474727869033813, 'train/info_loss': 0.22758245468139648, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001243199221789837, 'train/video_loss': 0.2274581342935562, 'train/total_loss': 0.41220539808273315} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2736, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4634, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3186, 'grad_norm': 8.461430549621582, 'learning_rate': 1.5289640103269626e-05}[Rank 0] Trainer log: {'loss': 0.3186, 'grad_norm': 8.461430549621582, 'learning_rate': 1.5289640103269626e-05} -[Rank 2] Trainer log: {'loss': 0.3186, 'grad_norm': 8.461430549621582, 'learning_rate': 1.5289640103269626e-05} -[Rank 3] Trainer log: {'loss': 0.3186, 'grad_norm': 8.461430549621582, 'learning_rate': 1.5289640103269626e-05} - -{'loss': 0.3186, 'grad_norm': 8.461430549621582, 'learning_rate': 1.5289640103269626e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18598244190216065, 'train/info_loss': 0.13221235573291779, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011960560223087669, 'train/video_loss': 0.13209274411201477, 'train/total_loss': 0.31807518005371094} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0944, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33578333854675296, 'train/info_loss': 0.24604880809783936, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011112682987004519, 'train/video_loss': 0.24593767523765564, 'train/total_loss': 0.5817210078239441} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1623, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4567, 'grad_norm': 7.066009044647217, 'learning_rate': 1.528058414259055e-05} -[Rank 0] Trainer log: {'loss': 0.4567, 'grad_norm': 7.066009044647217, 'learning_rate': 1.528058414259055e-05}[Rank 3] Trainer log: {'loss': 0.4567, 'grad_norm': 7.066009044647217, 'learning_rate': 1.528058414259055e-05} -[Rank 2] Trainer log: {'loss': 0.4567, 'grad_norm': 7.066009044647217, 'learning_rate': 1.528058414259055e-05} - -{'loss': 0.4567, 'grad_norm': 7.066009044647217, 'learning_rate': 1.528058414259055e-05, 'epoch': 0.36} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3529890775680542, 'train/info_loss': 0.20217131078243256, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012737478828057648, 'train/video_loss': 0.20204393565654755, 'train/total_loss': 0.5550330281257629} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.46059479713439944, 'train/info_loss': 0.18175971508026123, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010111023439094425, 'train/video_loss': 0.18165861070156097, 'train/total_loss': 0.6422533988952637} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2640, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4448, 'grad_norm': 8.97813606262207, 'learning_rate': 1.527152217278875e-05}[Rank 1] Trainer log: {'loss': 0.4448, 'grad_norm': 8.97813606262207, 'learning_rate': 1.527152217278875e-05}[Rank 0] Trainer log: {'loss': 0.4448, 'grad_norm': 8.97813606262207, 'learning_rate': 1.527152217278875e-05} - - -[Rank 3] Trainer log: {'loss': 0.4448, 'grad_norm': 8.97813606262207, 'learning_rate': 1.527152217278875e-05} -{'loss': 0.4448, 'grad_norm': 8.97813606262207, 'learning_rate': 1.527152217278875e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29646728038787845, 'train/info_loss': 0.11565768718719482, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011614533141255379, 'train/video_loss': 0.11554154008626938, 'train/total_loss': 0.4120088219642639} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3675469160079956, 'train/info_loss': 0.14109934866428375, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00016426262445747854, 'train/video_loss': 0.14093509316444397, 'train/total_loss': 0.5084819793701172} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3943, 'grad_norm': 3.6456544399261475, 'learning_rate': 1.5262454204176447e-05}[Rank 1] Trainer log: {'loss': 0.3943, 'grad_norm': 3.6456544399261475, 'learning_rate': 1.5262454204176447e-05}[Rank 2] Trainer log: {'loss': 0.3943, 'grad_norm': 3.6456544399261475, 'learning_rate': 1.5262454204176447e-05} - - -[Rank 0] Trainer log: {'loss': 0.3943, 'grad_norm': 3.6456544399261475, 'learning_rate': 1.5262454204176447e-05} -{'loss': 0.3943, 'grad_norm': 3.6456544399261475, 'learning_rate': 1.5262454204176447e-05, 'epoch': 0.36} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26700119972229003, 'train/info_loss': 0.14762069284915924, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011556461686268449, 'train/video_loss': 0.14750513434410095, 'train/total_loss': 0.41450634598731995} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2811643123626709, 'train/info_loss': 0.2073657363653183, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011677804868668318, 'train/video_loss': 0.20724895596504211, 'train/total_loss': 0.4884132742881775} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0257, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3764, 'grad_norm': 4.2912092208862305, 'learning_rate': 1.525338024707267e-05}[Rank 2] Trainer log: {'loss': 0.3764, 'grad_norm': 4.2912092208862305, 'learning_rate': 1.525338024707267e-05}[Rank 0] Trainer log: {'loss': 0.3764, 'grad_norm': 4.2912092208862305, 'learning_rate': 1.525338024707267e-05} - -[Rank 1] Trainer log: {'loss': 0.3764, 'grad_norm': 4.2912092208862305, 'learning_rate': 1.525338024707267e-05} - -{'loss': 0.3764, 'grad_norm': 4.2912092208862305, 'learning_rate': 1.525338024707267e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3126668453216553, 'train/info_loss': 0.22472108900547028, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011226714123040438, 'train/video_loss': 0.22460882365703583, 'train/total_loss': 0.5372756719589233} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0433, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0380, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010013245046138763, 'train/lm_loss': 0.0001774715492501855, 'train/info_loss': 4.619146420736797e-05, 'train/ref_loss': 0.24415460228919983, 'train/uncertainty_loss': 0.0037996441125869754, 'train/video_loss': 0.256011039018631, 'train/total_loss': 0.2561885118484497} -tensor(0.1005, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0227, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3023, 'grad_norm': 1.8492178916931152, 'learning_rate': 1.5244300311803278e-05}[Rank 2] Trainer log: {'loss': 0.3023, 'grad_norm': 1.8492178916931152, 'learning_rate': 1.5244300311803278e-05} - -[Rank 0] Trainer log: {'loss': 0.3023, 'grad_norm': 1.8492178916931152, 'learning_rate': 1.5244300311803278e-05}[Rank 3] Trainer log: {'loss': 0.3023, 'grad_norm': 1.8492178916931152, 'learning_rate': 1.5244300311803278e-05} - -{'loss': 0.3023, 'grad_norm': 1.8492178916931152, 'learning_rate': 1.5244300311803278e-05, 'epoch': 0.36} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28351750373840334, 'train/info_loss': 0.14422158896923065, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011285515502095223, 'train/video_loss': 0.14410872757434845, 'train/total_loss': 0.42762625217437744} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.2035, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2994, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0017, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27752089500427246, 'train/info_loss': 0.17904745042324066, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013764703180640936, 'train/video_loss': 0.1789098083972931, 'train/total_loss': 0.45643070340156555} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3352, 'grad_norm': 8.005034446716309, 'learning_rate': 1.5235214408700911e-05}[Rank 1] Trainer log: {'loss': 0.3352, 'grad_norm': 8.005034446716309, 'learning_rate': 1.5235214408700911e-05}[Rank 3] Trainer log: {'loss': 0.3352, 'grad_norm': 8.005034446716309, 'learning_rate': 1.5235214408700911e-05} - - -[Rank 0] Trainer log: {'loss': 0.3352, 'grad_norm': 8.005034446716309, 'learning_rate': 1.5235214408700911e-05} -{'loss': 0.3352, 'grad_norm': 8.005034446716309, 'learning_rate': 1.5235214408700911e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3107861280441284, 'train/info_loss': 0.2466629147529602, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011274004355072976, 'train/video_loss': 0.24655017256736755, 'train/total_loss': 0.5573363304138184} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1975290656089783, 'train/info_loss': 0.32156631350517273, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012288423022255301, 'train/video_loss': 0.32144343852996826, 'train/total_loss': 0.5189725160598755} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4546, 'grad_norm': 4.946585178375244, 'learning_rate': 1.5226122548105022e-05}[Rank 3] Trainer log: {'loss': 0.4546, 'grad_norm': 4.946585178375244, 'learning_rate': 1.5226122548105022e-05} - -[Rank 0] Trainer log: {'loss': 0.4546, 'grad_norm': 4.946585178375244, 'learning_rate': 1.5226122548105022e-05}[Rank 1] Trainer log: {'loss': 0.4546, 'grad_norm': 4.946585178375244, 'learning_rate': 1.5226122548105022e-05} - -{'loss': 0.4546, 'grad_norm': 4.946585178375244, 'learning_rate': 1.5226122548105022e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009639100171625614, 'train/lm_loss': 0.0001236294978298247, 'train/info_loss': 4.33902969234623e-05, 'train/ref_loss': 0.1721583604812622, 'train/uncertainty_loss': -7.52988096792251e-05, 'train/video_loss': 0.17983773350715637, 'train/total_loss': 0.17996136844158173} -tensor(0.0914, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.41758437156677247, 'train/info_loss': 0.19301436841487885, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001288185128942132, 'train/video_loss': 0.19288554787635803, 'train/total_loss': 0.6104699373245239} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.389, 'grad_norm': 1.4787845611572266, 'learning_rate': 1.5217024740361834e-05}[Rank 2] Trainer log: {'loss': 0.389, 'grad_norm': 1.4787845611572266, 'learning_rate': 1.5217024740361834e-05} -[Rank 1] Trainer log: {'loss': 0.389, 'grad_norm': 1.4787845611572266, 'learning_rate': 1.5217024740361834e-05} - -[Rank 0] Trainer log: {'loss': 0.389, 'grad_norm': 1.4787845611572266, 'learning_rate': 1.5217024740361834e-05} -{'loss': 0.389, 'grad_norm': 1.4787845611572266, 'learning_rate': 1.5217024740361834e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17230894565582278, 'train/info_loss': 0.23139849305152893, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013262118445709348, 'train/video_loss': 0.2312658727169037, 'train/total_loss': 0.4035748243331909} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1611, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1515, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009256200864911079, 'train/lm_loss': 7.816217839717865e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.31241780519485474, 'train/uncertainty_loss': 0.015150965750217439, 'train/video_loss': 0.3350085914134979, 'train/total_loss': 0.33508676290512085} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2703, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3328, 'grad_norm': 7.029734134674072, 'learning_rate': 1.5207920995824339e-05} -[Rank 2] Trainer log: {'loss': 0.3328, 'grad_norm': 7.029734134674072, 'learning_rate': 1.5207920995824339e-05}[Rank 1] Trainer log: {'loss': 0.3328, 'grad_norm': 7.029734134674072, 'learning_rate': 1.5207920995824339e-05}[Rank 0] Trainer log: {'loss': 0.3328, 'grad_norm': 7.029734134674072, 'learning_rate': 1.5207920995824339e-05} - - -{'loss': 0.3328, 'grad_norm': 7.029734134674072, 'learning_rate': 1.5207920995824339e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1100, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001032957062125206, 'train/lm_loss': 0.00010616345098242165, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.12541721761226654, 'train/uncertainty_loss': -7.518870988860727e-05, 'train/video_loss': 0.13364490866661072, 'train/total_loss': 0.13375107944011688} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23226065635681153, 'train/info_loss': 0.09906257688999176, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010279599810019136, 'train/video_loss': 0.098959781229496, 'train/total_loss': 0.33122044801712036} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4207, 'grad_norm': 3.359489679336548, 'learning_rate': 1.5198811324852278e-05}[Rank 2] Trainer log: {'loss': 0.4207, 'grad_norm': 3.359489679336548, 'learning_rate': 1.5198811324852278e-05}[Rank 0] Trainer log: {'loss': 0.4207, 'grad_norm': 3.359489679336548, 'learning_rate': 1.5198811324852278e-05} - -[Rank 1] Trainer log: {'loss': 0.4207, 'grad_norm': 3.359489679336548, 'learning_rate': 1.5198811324852278e-05} - -{'loss': 0.4207, 'grad_norm': 3.359489679336548, 'learning_rate': 1.5198811324852278e-05, 'epoch': 0.36} -tensor(-0.0019, device='cuda:2', grad_fn=) tensor(-0.0019, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2870481967926026, 'train/info_loss': 0.18639498949050903, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012665694812312724, 'train/video_loss': 0.18626832962036133, 'train/total_loss': 0.4733165204524994} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1026, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009092044085264206, 'train/lm_loss': 8.886270225048066e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.2834168076515198, 'train/uncertainty_loss': 0.010262295603752136, 'train/video_loss': 0.30098873376846313, 'train/total_loss': 0.30107760429382324} -tensor(0.0059, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4495, 'grad_norm': 6.113222599029541, 'learning_rate': 1.5189695737812153e-05}[Rank 2] Trainer log: {'loss': 0.4495, 'grad_norm': 6.113222599029541, 'learning_rate': 1.5189695737812153e-05} - -[Rank 0] Trainer log: {'loss': 0.4495, 'grad_norm': 6.113222599029541, 'learning_rate': 1.5189695737812153e-05}[Rank 1] Trainer log: {'loss': 0.4495, 'grad_norm': 6.113222599029541, 'learning_rate': 1.5189695737812153e-05} - -{'loss': 0.4495, 'grad_norm': 6.113222599029541, 'learning_rate': 1.5189695737812153e-05, 'epoch': 0.36} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0751, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1187, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000916182529181242, 'train/lm_loss': 0.00011443201219663024, 'train/info_loss': 4.207910751574673e-05, 'train/ref_loss': 0.29593366384506226, 'train/uncertainty_loss': 0.01187010630965233, 'train/video_loss': 0.315175324678421, 'train/total_loss': 0.31528976559638977} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000883624330163002, 'train/lm_loss': 0.00010032521095126867, 'train/info_loss': 4.046991671202704e-05, 'train/ref_loss': 0.18394631147384644, 'train/uncertainty_loss': -7.085553370416165e-05, 'train/video_loss': 0.19098491966724396, 'train/total_loss': 0.19108524918556213} -tensor(0.5577, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3979, 'grad_norm': 7.963551044464111, 'learning_rate': 1.518057424507718e-05}[Rank 2] Trainer log: {'loss': 0.3979, 'grad_norm': 7.963551044464111, 'learning_rate': 1.518057424507718e-05}[Rank 3] Trainer log: {'loss': 0.3979, 'grad_norm': 7.963551044464111, 'learning_rate': 1.518057424507718e-05} - - -[Rank 0] Trainer log: {'loss': 0.3979, 'grad_norm': 7.963551044464111, 'learning_rate': 1.518057424507718e-05} -{'loss': 0.3979, 'grad_norm': 7.963551044464111, 'learning_rate': 1.518057424507718e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2669972896575928, 'train/info_loss': 0.35539448261260986, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012366195442155005, 'train/video_loss': 0.3552708327770233, 'train/total_loss': 0.6222681403160095} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0170, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.1340, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010374453850090504, 'train/lm_loss': 0.00013897357275709511, 'train/info_loss': 4.410549081512727e-05, 'train/ref_loss': 0.3046366572380066, 'train/uncertainty_loss': 0.013396705687046052, 'train/video_loss': 0.3263770341873169, 'train/total_loss': 0.3265160024166107} -[Rank 3] Trainer log: {'loss': 0.4236, 'grad_norm': 1.6020183563232422, 'learning_rate': 1.5171446857027309e-05}[Rank 1] Trainer log: {'loss': 0.4236, 'grad_norm': 1.6020183563232422, 'learning_rate': 1.5171446857027309e-05}[Rank 2] Trainer log: {'loss': 0.4236, 'grad_norm': 1.6020183563232422, 'learning_rate': 1.5171446857027309e-05} - - -[Rank 0] Trainer log: {'loss': 0.4236, 'grad_norm': 1.6020183563232422, 'learning_rate': 1.5171446857027309e-05} -{'loss': 0.4236, 'grad_norm': 1.6020183563232422, 'learning_rate': 1.5171446857027309e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009726442396640778, 'train/lm_loss': 0.00010042053181678057, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.1120947003364563, 'train/uncertainty_loss': -7.033874280750752e-05, 'train/video_loss': 0.11984384059906006, 'train/total_loss': 0.1199442595243454} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1211, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009912817738950252, 'train/lm_loss': 0.00011512303026393056, 'train/info_loss': 4.142351099289954e-05, 'train/ref_loss': 0.29193928837776184, 'train/uncertainty_loss': 0.012113147228956223, 'train/video_loss': 0.3120241165161133, 'train/total_loss': 0.31213924288749695} -[Rank 1] Trainer log: {'loss': 0.3271, 'grad_norm': 6.076045036315918, 'learning_rate': 1.5162313584049186e-05} -[Rank 3] Trainer log: {'loss': 0.3271, 'grad_norm': 6.076045036315918, 'learning_rate': 1.5162313584049186e-05} -[Rank 0] Trainer log: {'loss': 0.3271, 'grad_norm': 6.076045036315918, 'learning_rate': 1.5162313584049186e-05} -[Rank 2] Trainer log: {'loss': 0.3271, 'grad_norm': 6.076045036315918, 'learning_rate': 1.5162313584049186e-05} -{'loss': 0.3271, 'grad_norm': 6.076045036315918, 'learning_rate': 1.5162313584049186e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2552379369735718, 'train/info_loss': 0.21169205009937286, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013527859700843692, 'train/video_loss': 0.21155677735805511, 'train/total_loss': 0.4667947292327881} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0877, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008282340131700039, 'train/lm_loss': 8.905334980227054e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.2769569158554077, 'train/uncertainty_loss': 0.008768696337938309, 'train/video_loss': 0.2923886179924011, 'train/total_loss': 0.29247766733169556} -[Rank 3] Trainer log: {'loss': 0.4176, 'grad_norm': 6.71674108505249, 'learning_rate': 1.5153174436536166e-05}[Rank 1] Trainer log: {'loss': 0.4176, 'grad_norm': 6.71674108505249, 'learning_rate': 1.5153174436536166e-05}[Rank 0] Trainer log: {'loss': 0.4176, 'grad_norm': 6.71674108505249, 'learning_rate': 1.5153174436536166e-05} - -[Rank 2] Trainer log: {'loss': 0.4176, 'grad_norm': 6.71674108505249, 'learning_rate': 1.5153174436536166e-05} - -{'loss': 0.4176, 'grad_norm': 6.71674108505249, 'learning_rate': 1.5153174436536166e-05, 'epoch': 0.36} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0365, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009099421091377735, 'train/lm_loss': 0.00011493241181597114, 'train/info_loss': 3.9814316551201046e-05, 'train/ref_loss': 0.2391064167022705, 'train/uncertainty_loss': 0.0036496009677648546, 'train/video_loss': 0.2500753700733185, 'train/total_loss': 0.25019028782844543} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35207853317260746, 'train/info_loss': 0.2023613601922989, 'train/ref_loss': None, 'train/uncertainty_loss': -8.771615684963763e-05, 'train/video_loss': 0.2022736370563507, 'train/total_loss': 0.5543521642684937} -tensor(0.1985, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0372, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4253, 'grad_norm': 3.6137921810150146, 'learning_rate': 1.5144029424888283e-05}[Rank 2] Trainer log: {'loss': 0.4253, 'grad_norm': 3.6137921810150146, 'learning_rate': 1.5144029424888283e-05}[Rank 1] Trainer log: {'loss': 0.4253, 'grad_norm': 3.6137921810150146, 'learning_rate': 1.5144029424888283e-05} - - -[Rank 0] Trainer log: {'loss': 0.4253, 'grad_norm': 3.6137921810150146, 'learning_rate': 1.5144029424888283e-05} -{'loss': 0.4253, 'grad_norm': 3.6137921810150146, 'learning_rate': 1.5144029424888283e-05, 'epoch': 0.36} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29533884525299076, 'train/info_loss': 0.2185956835746765, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014219543663784862, 'train/video_loss': 0.21845348179340363, 'train/total_loss': 0.5137923359870911} -tensor(0.1528, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1115, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3274204015731812, 'train/info_loss': 0.19653378427028656, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001408563810400665, 'train/video_loss': 0.19639292359352112, 'train/total_loss': 0.5238133668899536} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4236, 'grad_norm': 12.08053970336914, 'learning_rate': 1.5134878559512243e-05}[Rank 1] Trainer log: {'loss': 0.4236, 'grad_norm': 12.08053970336914, 'learning_rate': 1.5134878559512243e-05}[Rank 2] Trainer log: {'loss': 0.4236, 'grad_norm': 12.08053970336914, 'learning_rate': 1.5134878559512243e-05} - - -[Rank 0] Trainer log: {'loss': 0.4236, 'grad_norm': 12.08053970336914, 'learning_rate': 1.5134878559512243e-05} -{'loss': 0.4236, 'grad_norm': 12.08053970336914, 'learning_rate': 1.5134878559512243e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0138, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0962, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008625790476799012, 'train/lm_loss': 0.00014847964048385622, 'train/info_loss': 4.4820684706792235e-05, 'train/ref_loss': 0.2824379801750183, 'train/uncertainty_loss': 0.009616340696811677, 'train/video_loss': 0.2989997863769531, 'train/total_loss': 0.2991482615470886} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008385340683162213, 'train/lm_loss': 0.000130015064496547, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.1285165250301361, 'train/uncertainty_loss': -7.335199625231327e-05, 'train/video_loss': 0.1351945400238037, 'train/total_loss': 0.13532455265522003} -tensor(0.1045, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3016, 'grad_norm': 3.299135208129883, 'learning_rate': 1.5125721850821415e-05}[Rank 1] Trainer log: {'loss': 0.3016, 'grad_norm': 3.299135208129883, 'learning_rate': 1.5125721850821415e-05}[Rank 2] Trainer log: {'loss': 0.3016, 'grad_norm': 3.299135208129883, 'learning_rate': 1.5125721850821415e-05} - -[Rank 0] Trainer log: {'loss': 0.3016, 'grad_norm': 3.299135208129883, 'learning_rate': 1.5125721850821415e-05} - -{'loss': 0.3016, 'grad_norm': 3.299135208129883, 'learning_rate': 1.5125721850821415e-05, 'epoch': 0.36} -tensor(-0.0016, device='cuda:1', grad_fn=) tensor(-0.0016, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42246432304382325, 'train/info_loss': 0.14372403919696808, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012860355200245978, 'train/video_loss': 0.14359544217586517, 'train/total_loss': 0.5660597681999207} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37235865592956546, 'train/info_loss': 0.2750464975833893, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012553390115499496, 'train/video_loss': 0.2749209702014923, 'train/total_loss': 0.6472796201705933} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.6348, 'grad_norm': 4.6229352951049805, 'learning_rate': 1.5116559309235825e-05}[Rank 2] Trainer log: {'loss': 0.6348, 'grad_norm': 4.6229352951049805, 'learning_rate': 1.5116559309235825e-05}[Rank 1] Trainer log: {'loss': 0.6348, 'grad_norm': 4.6229352951049805, 'learning_rate': 1.5116559309235825e-05} - - -[Rank 0] Trainer log: {'loss': 0.6348, 'grad_norm': 4.6229352951049805, 'learning_rate': 1.5116559309235825e-05} -{'loss': 0.6348, 'grad_norm': 4.6229352951049805, 'learning_rate': 1.5116559309235825e-05, 'epoch': 0.36} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42646312713623047, 'train/info_loss': 0.22275874018669128, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011357787298038602, 'train/video_loss': 0.22264516353607178, 'train/total_loss': 0.6491082906723022} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.3011, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4052813529968262, 'train/info_loss': 0.21674272418022156, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012025570031255485, 'train/video_loss': 0.2166224718093872, 'train/total_loss': 0.6219038367271423} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4794, 'grad_norm': 5.236268520355225, 'learning_rate': 1.5107390945182119e-05}[Rank 2] Trainer log: {'loss': 0.4794, 'grad_norm': 5.236268520355225, 'learning_rate': 1.5107390945182119e-05}[Rank 3] Trainer log: {'loss': 0.4794, 'grad_norm': 5.236268520355225, 'learning_rate': 1.5107390945182119e-05} - - -[Rank 0] Trainer log: {'loss': 0.4794, 'grad_norm': 5.236268520355225, 'learning_rate': 1.5107390945182119e-05} -{'loss': 0.4794, 'grad_norm': 5.236268520355225, 'learning_rate': 1.5107390945182119e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3866177320480347, 'train/info_loss': 0.2099151313304901, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012977354926988483, 'train/video_loss': 0.2097853571176529, 'train/total_loss': 0.5964031219482422} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2648634433746338, 'train/info_loss': 0.252676397562027, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012509837979450824, 'train/video_loss': 0.25255128741264343, 'train/total_loss': 0.5174147486686707} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0648, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3489, 'grad_norm': 5.148230075836182, 'learning_rate': 1.5098216769093588e-05} -[Rank 3] Trainer log: {'loss': 0.3489, 'grad_norm': 5.148230075836182, 'learning_rate': 1.5098216769093588e-05} -[Rank 2] Trainer log: {'loss': 0.3489, 'grad_norm': 5.148230075836182, 'learning_rate': 1.5098216769093588e-05} -[Rank 0] Trainer log: {'loss': 0.3489, 'grad_norm': 5.148230075836182, 'learning_rate': 1.5098216769093588e-05} -{'loss': 0.3489, 'grad_norm': 5.148230075836182, 'learning_rate': 1.5098216769093588e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010449131950736046, 'train/lm_loss': 7.80430156737566e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.1096353828907013, 'train/uncertainty_loss': -7.688901969231665e-05, 'train/video_loss': 0.11795379966497421, 'train/total_loss': 0.11803184449672699} -tensor(0.1060, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32273564338684085, 'train/info_loss': 0.2037520557641983, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011058097006753088, 'train/video_loss': 0.20364147424697876, 'train/total_loss': 0.5263770818710327} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0233, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2981, 'grad_norm': 1.669852614402771, 'learning_rate': 1.508903679141012e-05}[Rank 1] Trainer log: {'loss': 0.2981, 'grad_norm': 1.669852614402771, 'learning_rate': 1.508903679141012e-05}[Rank 2] Trainer log: {'loss': 0.2981, 'grad_norm': 1.669852614402771, 'learning_rate': 1.508903679141012e-05} - -[Rank 3] Trainer log: {'loss': 0.2981, 'grad_norm': 1.669852614402771, 'learning_rate': 1.508903679141012e-05} - -{'loss': 0.2981, 'grad_norm': 1.669852614402771, 'learning_rate': 1.508903679141012e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.4201, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001107765082269907, 'train/lm_loss': 0.00024165806826204062, 'train/info_loss': 4.654905933421105e-05, 'train/ref_loss': 0.4705703556537628, 'train/uncertainty_loss': 0.04200577735900879, 'train/video_loss': 0.5214847922325134, 'train/total_loss': 0.5217264294624329} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3505163908004761, 'train/info_loss': 0.3101218342781067, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001454188139177859, 'train/video_loss': 0.3099764287471771, 'train/total_loss': 0.6604928374290466} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4991, 'grad_norm': 4.095403671264648, 'learning_rate': 1.5079851022578221e-05} -[Rank 1] Trainer log: {'loss': 0.4991, 'grad_norm': 4.095403671264648, 'learning_rate': 1.5079851022578221e-05} -[Rank 3] Trainer log: {'loss': 0.4991, 'grad_norm': 4.095403671264648, 'learning_rate': 1.5079851022578221e-05}[Rank 0] Trainer log: {'loss': 0.4991, 'grad_norm': 4.095403671264648, 'learning_rate': 1.5079851022578221e-05} - -{'loss': 0.4991, 'grad_norm': 4.095403671264648, 'learning_rate': 1.5079851022578221e-05, 'epoch': 0.36} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26025354862213135, 'train/info_loss': 0.23385591804981232, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011642454192042352, 'train/video_loss': 0.23373949527740479, 'train/total_loss': 0.49399304389953613} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31603574752807617, 'train/info_loss': 0.13020329177379608, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010855430737137795, 'train/video_loss': 0.1300947368144989, 'train/total_loss': 0.4461304843425751} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4386, 'grad_norm': 4.419051170349121, 'learning_rate': 1.5070659473050975e-05}[Rank 3] Trainer log: {'loss': 0.4386, 'grad_norm': 4.419051170349121, 'learning_rate': 1.5070659473050975e-05}[Rank 2] Trainer log: {'loss': 0.4386, 'grad_norm': 4.419051170349121, 'learning_rate': 1.5070659473050975e-05} - - -[Rank 0] Trainer log: {'loss': 0.4386, 'grad_norm': 4.419051170349121, 'learning_rate': 1.5070659473050975e-05} -{'loss': 0.4386, 'grad_norm': 4.419051170349121, 'learning_rate': 1.5070659473050975e-05, 'epoch': 0.36} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19099701642990113, 'train/info_loss': 0.21595320105552673, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010446197120472789, 'train/video_loss': 0.21584874391555786, 'train/total_loss': 0.40684574842453003} -tensor(0.6155, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.3941, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010383891873061658, 'train/lm_loss': 0.00014709783717989922, 'train/info_loss': 4.374789568828419e-05, 'train/ref_loss': 0.43648314476013184, 'train/uncertainty_loss': 0.03941328525543213, 'train/video_loss': 0.48424726724624634, 'train/total_loss': 0.484394371509552} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.479, 'grad_norm': 12.907425880432129, 'learning_rate': 1.506146215328805e-05}[Rank 0] Trainer log: {'loss': 0.479, 'grad_norm': 12.907425880432129, 'learning_rate': 1.506146215328805e-05} -[Rank 2] Trainer log: {'loss': 0.479, 'grad_norm': 12.907425880432129, 'learning_rate': 1.506146215328805e-05} -[Rank 3] Trainer log: {'loss': 0.479, 'grad_norm': 12.907425880432129, 'learning_rate': 1.506146215328805e-05} - -{'loss': 0.479, 'grad_norm': 12.907425880432129, 'learning_rate': 1.506146215328805e-05, 'epoch': 0.36} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3792869567871094, 'train/info_loss': 0.19881655275821686, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012865600874647498, 'train/video_loss': 0.19868789613246918, 'train/total_loss': 0.5779748558998108} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0270, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29509763717651366, 'train/info_loss': 0.16546078026294708, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001419314299710095, 'train/video_loss': 0.16531884670257568, 'train/total_loss': 0.4604164958000183} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3818, 'grad_norm': 6.235344409942627, 'learning_rate': 1.5052259073755678e-05} -[Rank 3] Trainer log: {'loss': 0.3818, 'grad_norm': 6.235344409942627, 'learning_rate': 1.5052259073755678e-05} -[Rank 0] Trainer log: {'loss': 0.3818, 'grad_norm': 6.235344409942627, 'learning_rate': 1.5052259073755678e-05}[Rank 2] Trainer log: {'loss': 0.3818, 'grad_norm': 6.235344409942627, 'learning_rate': 1.5052259073755678e-05} - -{'loss': 0.3818, 'grad_norm': 6.235344409942627, 'learning_rate': 1.5052259073755678e-05, 'epoch': 0.36} -tensor(-0.0019, device='cuda:0', grad_fn=) tensor(-0.0019, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3076168775558472, 'train/info_loss': 0.5124601125717163, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00019186314893886449, 'train/video_loss': 0.5122682452201843, 'train/total_loss': 0.8198851346969604} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2761, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1546319842338562, 'train/info_loss': 0.18126267194747925, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001047754311002791, 'train/video_loss': 0.1811579018831253, 'train/total_loss': 0.33578988909721375} -[Rank 2] Trainer log: {'loss': 0.4376, 'grad_norm': 5.060033798217773, 'learning_rate': 1.5043050244926644e-05}[Rank 0] Trainer log: {'loss': 0.4376, 'grad_norm': 5.060033798217773, 'learning_rate': 1.5043050244926644e-05}[Rank 3] Trainer log: {'loss': 0.4376, 'grad_norm': 5.060033798217773, 'learning_rate': 1.5043050244926644e-05} - - -[Rank 1] Trainer log: {'loss': 0.4376, 'grad_norm': 5.060033798217773, 'learning_rate': 1.5043050244926644e-05} -{'loss': 0.4376, 'grad_norm': 5.060033798217773, 'learning_rate': 1.5043050244926644e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.43027377128601074, 'train/info_loss': 0.14611639082431793, 'train/ref_loss': None, 'train/uncertainty_loss': -9.843459120020271e-05, 'train/video_loss': 0.14601795375347137, 'train/total_loss': 0.5762917399406433} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2067, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23934526443481446, 'train/info_loss': 0.2529483437538147, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001369569217786193, 'train/video_loss': 0.25281137228012085, 'train/total_loss': 0.49215662479400635} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0742, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0800, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3898, 'grad_norm': 8.726551055908203, 'learning_rate': 1.5033835677280285e-05} -[Rank 1] Trainer log: {'loss': 0.3898, 'grad_norm': 8.726551055908203, 'learning_rate': 1.5033835677280285e-05} -[Rank 2] Trainer log: {'loss': 0.3898, 'grad_norm': 8.726551055908203, 'learning_rate': 1.5033835677280285e-05} -[Rank 0] Trainer log: {'loss': 0.3898, 'grad_norm': 8.726551055908203, 'learning_rate': 1.5033835677280285e-05} -{'loss': 0.3898, 'grad_norm': 8.726551055908203, 'learning_rate': 1.5033835677280285e-05, 'epoch': 0.37} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3177137613296509, 'train/info_loss': 0.13685129582881927, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010459467303007841, 'train/video_loss': 0.13674670457839966, 'train/total_loss': 0.454460471868515} -tensor(0.0507, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1122, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.2316, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001001252606511116, 'train/lm_loss': 9.996776934713126e-05, 'train/info_loss': 4.046991671202704e-05, 'train/ref_loss': 0.3677663803100586, 'train/uncertainty_loss': 0.023161934316158296, 'train/video_loss': 0.3989788293838501, 'train/total_loss': 0.3990787863731384} -tensor(0.0398, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.407, 'grad_norm': 4.5443925857543945, 'learning_rate': 1.5024615381302452e-05}[Rank 2] Trainer log: {'loss': 0.407, 'grad_norm': 4.5443925857543945, 'learning_rate': 1.5024615381302452e-05}[Rank 3] Trainer log: {'loss': 0.407, 'grad_norm': 4.5443925857543945, 'learning_rate': 1.5024615381302452e-05} - - -[Rank 0] Trainer log: {'loss': 0.407, 'grad_norm': 4.5443925857543945, 'learning_rate': 1.5024615381302452e-05} -{'loss': 0.407, 'grad_norm': 4.5443925857543945, 'learning_rate': 1.5024615381302452e-05, 'epoch': 0.37} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14048912525177001, 'train/info_loss': 0.20156267285346985, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010114776669070125, 'train/video_loss': 0.201461523771286, 'train/total_loss': 0.3419506549835205} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.6957, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14263538122177125, 'train/info_loss': 0.19982148706912994, 'train/ref_loss': None, 'train/uncertainty_loss': -8.105804445222021e-05, 'train/video_loss': 0.1997404247522354, 'train/total_loss': 0.34237581491470337} -tensor(0.2696, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4432, 'grad_norm': 13.025771141052246, 'learning_rate': 1.501538936748553e-05}[Rank 2] Trainer log: {'loss': 0.4432, 'grad_norm': 13.025771141052246, 'learning_rate': 1.501538936748553e-05} -[Rank 1] Trainer log: {'loss': 0.4432, 'grad_norm': 13.025771141052246, 'learning_rate': 1.501538936748553e-05} - -[Rank 0] Trainer log: {'loss': 0.4432, 'grad_norm': 13.025771141052246, 'learning_rate': 1.501538936748553e-05} -{'loss': 0.4432, 'grad_norm': 13.025771141052246, 'learning_rate': 1.501538936748553e-05, 'epoch': 0.37} -tensor(-0.0018, device='cuda:2', grad_fn=) tensor(-0.0018, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1091, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009795991703867913, 'train/lm_loss': 0.00012865696335211398, 'train/info_loss': 4.207910751574673e-05, 'train/ref_loss': 0.2878034710884094, 'train/uncertainty_loss': 0.010907368361949922, 'train/video_loss': 0.3065897226333618, 'train/total_loss': 0.3067183792591095} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0663, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(1.6256, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0713, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009316744282841683, 'train/lm_loss': 7.723270682618022e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.263214111328125, 'train/uncertainty_loss': 0.0071320094168186195, 'train/video_loss': 0.2778341472148895, 'train/total_loss': 0.27791136503219604} -[Rank 3] Trainer log: {'loss': 0.5037, 'grad_norm': 22.505382537841797, 'learning_rate': 1.50061576463284e-05}[Rank 1] Trainer log: {'loss': 0.5037, 'grad_norm': 22.505382537841797, 'learning_rate': 1.50061576463284e-05}[Rank 2] Trainer log: {'loss': 0.5037, 'grad_norm': 22.505382537841797, 'learning_rate': 1.50061576463284e-05} - - -[Rank 0] Trainer log: {'loss': 0.5037, 'grad_norm': 22.505382537841797, 'learning_rate': 1.50061576463284e-05} -{'loss': 0.5037, 'grad_norm': 22.505382537841797, 'learning_rate': 1.50061576463284e-05, 'epoch': 0.37} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36312878131866455, 'train/info_loss': 0.12978911399841309, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001433066325262189, 'train/video_loss': 0.12964580953121185, 'train/total_loss': 0.4927746057510376} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24549477100372316, 'train/info_loss': 0.16399399936199188, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001394615974277258, 'train/video_loss': 0.16385453939437866, 'train/total_loss': 0.40934932231903076} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1207, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3527, 'grad_norm': 6.5288472175598145, 'learning_rate': 1.499692022833645e-05} -[Rank 0] Trainer log: {'loss': 0.3527, 'grad_norm': 6.5288472175598145, 'learning_rate': 1.499692022833645e-05}[Rank 3] Trainer log: {'loss': 0.3527, 'grad_norm': 6.5288472175598145, 'learning_rate': 1.499692022833645e-05}[Rank 2] Trainer log: {'loss': 0.3527, 'grad_norm': 6.5288472175598145, 'learning_rate': 1.499692022833645e-05} - - -{'loss': 0.3527, 'grad_norm': 6.5288472175598145, 'learning_rate': 1.499692022833645e-05, 'epoch': 0.37} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22523357868194582, 'train/info_loss': 0.2055409699678421, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001248539541848004, 'train/video_loss': 0.20541611313819885, 'train/total_loss': 0.43064969778060913} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0729, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000996299646794796, 'train/lm_loss': 8.819542708806694e-05, 'train/info_loss': 3.862231824314222e-05, 'train/ref_loss': 0.21951542794704437, 'train/uncertainty_loss': -7.259215926751495e-05, 'train/video_loss': 0.2274518460035324, 'train/total_loss': 0.2275400459766388} -[Rank 3] Trainer log: {'loss': 0.4063, 'grad_norm': 2.535419464111328, 'learning_rate': 1.4987677124021538e-05} -[Rank 1] Trainer log: {'loss': 0.4063, 'grad_norm': 2.535419464111328, 'learning_rate': 1.4987677124021538e-05}[Rank 2] Trainer log: {'loss': 0.4063, 'grad_norm': 2.535419464111328, 'learning_rate': 1.4987677124021538e-05} - -[Rank 0] Trainer log: {'loss': 0.4063, 'grad_norm': 2.535419464111328, 'learning_rate': 1.4987677124021538e-05} -{'loss': 0.4063, 'grad_norm': 2.535419464111328, 'learning_rate': 1.4987677124021538e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38909635543823246, 'train/info_loss': 0.2565133273601532, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011147172190248967, 'train/video_loss': 0.2564018666744232, 'train/total_loss': 0.6454982161521912} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3163, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006566289812326431, 'train/lm_loss': 0.0001554123591631651, 'train/info_loss': 4.768144208355807e-05, 'train/ref_loss': 0.43844956159591675, 'train/uncertainty_loss': 0.03163293600082397, 'train/video_loss': 0.4753831923007965, 'train/total_loss': 0.47553861141204834} -tensor(0.1281, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4776, 'grad_norm': 4.234839916229248, 'learning_rate': 1.4978428343901999e-05}[Rank 2] Trainer log: {'loss': 0.4776, 'grad_norm': 4.234839916229248, 'learning_rate': 1.4978428343901999e-05} -[Rank 0] Trainer log: {'loss': 0.4776, 'grad_norm': 4.234839916229248, 'learning_rate': 1.4978428343901999e-05} -[Rank 1] Trainer log: {'loss': 0.4776, 'grad_norm': 4.234839916229248, 'learning_rate': 1.4978428343901999e-05} - -{'loss': 0.4776, 'grad_norm': 4.234839916229248, 'learning_rate': 1.4978428343901999e-05, 'epoch': 0.37} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20955069065093995, 'train/info_loss': 0.14831307530403137, 'train/ref_loss': None, 'train/uncertainty_loss': -8.758127223700285e-05, 'train/video_loss': 0.14822550117969513, 'train/total_loss': 0.3577761948108673} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(0.1762, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(0.1458, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006841102614998818, 'train/lm_loss': 0.00012172332499176264, 'train/info_loss': 4.374789568828419e-05, 'train/ref_loss': 0.30795854330062866, 'train/uncertainty_loss': 0.014583052694797517, 'train/video_loss': 0.3280582129955292, 'train/total_loss': 0.3281799256801605} -[Rank 3] Trainer log: {'loss': 0.371, 'grad_norm': 4.616837978363037, 'learning_rate': 1.4969173898502625e-05}[Rank 2] Trainer log: {'loss': 0.371, 'grad_norm': 4.616837978363037, 'learning_rate': 1.4969173898502625e-05}[Rank 1] Trainer log: {'loss': 0.371, 'grad_norm': 4.616837978363037, 'learning_rate': 1.4969173898502625e-05} - - -[Rank 0] Trainer log: {'loss': 0.371, 'grad_norm': 4.616837978363037, 'learning_rate': 1.4969173898502625e-05} -{'loss': 0.371, 'grad_norm': 4.616837978363037, 'learning_rate': 1.4969173898502625e-05, 'epoch': 0.37} -tensor(0.1237, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0278, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.8541, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000838046707212925, 'train/lm_loss': 0.00011483709095045925, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.8773698210716248, 'train/uncertainty_loss': 0.08541227579116822, 'train/video_loss': 0.9695295691490173, 'train/total_loss': 0.9696444272994995} -tensor(0.3952, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38315556049346927, 'train/info_loss': 0.2575109899044037, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012806693557649852, 'train/video_loss': 0.25738292932510376, 'train/total_loss': 0.6405384540557861} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1925, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4944, 'grad_norm': 26.038673400878906, 'learning_rate': 1.4959913798354658e-05} -[Rank 3] Trainer log: {'loss': 0.4944, 'grad_norm': 26.038673400878906, 'learning_rate': 1.4959913798354658e-05}[Rank 2] Trainer log: {'loss': 0.4944, 'grad_norm': 26.038673400878906, 'learning_rate': 1.4959913798354658e-05} - -[Rank 0] Trainer log: {'loss': 0.4944, 'grad_norm': 26.038673400878906, 'learning_rate': 1.4959913798354658e-05} -{'loss': 0.4944, 'grad_norm': 26.038673400878906, 'learning_rate': 1.4959913798354658e-05, 'epoch': 0.37} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0769, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008654359728097916, 'train/lm_loss': 9.34859155677259e-05, 'train/info_loss': 3.9814316551201046e-05, 'train/ref_loss': 0.04751706123352051, 'train/uncertainty_loss': -6.985778454691172e-05, 'train/video_loss': 0.05441050976514816, 'train/total_loss': 0.054503995925188065} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20990328788757326, 'train/info_loss': 0.16271789371967316, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011567182373255492, 'train/video_loss': 0.16260221600532532, 'train/total_loss': 0.3725054860115051} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3014, 'grad_norm': 3.1910221576690674, 'learning_rate': 1.4950648053995769e-05}[Rank 0] Trainer log: {'loss': 0.3014, 'grad_norm': 3.1910221576690674, 'learning_rate': 1.4950648053995769e-05}[Rank 3] Trainer log: {'loss': 0.3014, 'grad_norm': 3.1910221576690674, 'learning_rate': 1.4950648053995769e-05} - -[Rank 2] Trainer log: {'loss': 0.3014, 'grad_norm': 3.1910221576690674, 'learning_rate': 1.4950648053995769e-05} - -{'loss': 0.3014, 'grad_norm': 3.1910221576690674, 'learning_rate': 1.4950648053995769e-05, 'epoch': 0.37} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3391, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0030, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000826200656592846, 'train/lm_loss': 9.968180675059558e-05, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.18502703309059143, 'train/uncertainty_loss': 0.0002998936921358109, 'train/video_loss': 0.19197671115398407, 'train/total_loss': 0.1920763999223709} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1663, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007710193749517202, 'train/lm_loss': 0.0001130499760620296, 'train/info_loss': 4.33902969234623e-05, 'train/ref_loss': 0.32357925176620483, 'train/uncertainty_loss': 0.016633540391921997, 'train/video_loss': 0.3464243412017822, 'train/total_loss': 0.34653738141059875} -[Rank 0] Trainer log: {'loss': 0.3809, 'grad_norm': 11.5079984664917, 'learning_rate': 1.4941376675970058e-05}[Rank 1] Trainer log: {'loss': 0.3809, 'grad_norm': 11.5079984664917, 'learning_rate': 1.4941376675970058e-05} -[Rank 2] Trainer log: {'loss': 0.3809, 'grad_norm': 11.5079984664917, 'learning_rate': 1.4941376675970058e-05} - -[Rank 3] Trainer log: {'loss': 0.3809, 'grad_norm': 11.5079984664917, 'learning_rate': 1.4941376675970058e-05} -{'loss': 0.3809, 'grad_norm': 11.5079984664917, 'learning_rate': 1.4941376675970058e-05, 'epoch': 0.37} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.41114273071289065, 'train/info_loss': 0.24095135927200317, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012135911965742708, 'train/video_loss': 0.24083000421524048, 'train/total_loss': 0.651972770690918} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1395564317703247, 'train/info_loss': 0.3627263009548187, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011385279940441252, 'train/video_loss': 0.36261245608329773, 'train/total_loss': 0.5021688938140869} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1134, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4503, 'grad_norm': 5.181075096130371, 'learning_rate': 1.4932099674828035e-05}[Rank 1] Trainer log: {'loss': 0.4503, 'grad_norm': 5.181075096130371, 'learning_rate': 1.4932099674828035e-05}[Rank 0] Trainer log: {'loss': 0.4503, 'grad_norm': 5.181075096130371, 'learning_rate': 1.4932099674828035e-05} - - -[Rank 2] Trainer log: {'loss': 0.4503, 'grad_norm': 5.181075096130371, 'learning_rate': 1.4932099674828035e-05} -{'loss': 0.4503, 'grad_norm': 5.181075096130371, 'learning_rate': 1.4932099674828035e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1561, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007572034373879433, 'train/lm_loss': 8.77187994774431e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.28658321499824524, 'train/uncertainty_loss': 0.015605235099792482, 'train/video_loss': 0.3082844018936157, 'train/total_loss': 0.3083721101284027} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16997554302215578, 'train/info_loss': 0.21795547008514404, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011627476196736098, 'train/video_loss': 0.21783919632434845, 'train/total_loss': 0.3878147602081299} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1509, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3911, 'grad_norm': 5.813899040222168, 'learning_rate': 1.4922817061126607e-05}[Rank 3] Trainer log: {'loss': 0.3911, 'grad_norm': 5.813899040222168, 'learning_rate': 1.4922817061126607e-05}[Rank 2] Trainer log: {'loss': 0.3911, 'grad_norm': 5.813899040222168, 'learning_rate': 1.4922817061126607e-05} - - -[Rank 0] Trainer log: {'loss': 0.3911, 'grad_norm': 5.813899040222168, 'learning_rate': 1.4922817061126607e-05} -{'loss': 0.3911, 'grad_norm': 5.813899040222168, 'learning_rate': 1.4922817061126607e-05, 'epoch': 0.37} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.5256, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012639869935810568, 'train/lm_loss': 8.819542708806694e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.6507666110992432, 'train/uncertainty_loss': 0.0525624692440033, 'train/video_loss': 0.7134792804718018, 'train/total_loss': 0.7135674953460693} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21001436710357668, 'train/info_loss': 0.13915014266967773, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001190803130157292, 'train/video_loss': 0.1390310674905777, 'train/total_loss': 0.34904545545578003} -tensor(0.9923, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5215, 'grad_norm': 5.933712005615234, 'learning_rate': 1.4913528845429065e-05}[Rank 3] Trainer log: {'loss': 0.5215, 'grad_norm': 5.933712005615234, 'learning_rate': 1.4913528845429065e-05}[Rank 2] Trainer log: {'loss': 0.5215, 'grad_norm': 5.933712005615234, 'learning_rate': 1.4913528845429065e-05} - - -[Rank 0] Trainer log: {'loss': 0.5215, 'grad_norm': 5.933712005615234, 'learning_rate': 1.4913528845429065e-05} -{'loss': 0.5215, 'grad_norm': 5.933712005615234, 'learning_rate': 1.4913528845429065e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3828892469406128, 'train/info_loss': 0.22668184340000153, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011379480129107833, 'train/video_loss': 0.2265680432319641, 'train/total_loss': 0.60945725440979} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1604, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22747709751129153, 'train/info_loss': 0.13844701647758484, 'train/ref_loss': None, 'train/uncertainty_loss': -9.138126624748112e-05, 'train/video_loss': 0.13835564255714417, 'train/total_loss': 0.36583274602890015} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1904, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0142, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4279, 'grad_norm': 6.51883602142334, 'learning_rate': 1.4904235038305084e-05}[Rank 1] Trainer log: {'loss': 0.4279, 'grad_norm': 6.51883602142334, 'learning_rate': 1.4904235038305084e-05}[Rank 3] Trainer log: {'loss': 0.4279, 'grad_norm': 6.51883602142334, 'learning_rate': 1.4904235038305084e-05} - - -[Rank 0] Trainer log: {'loss': 0.4279, 'grad_norm': 6.51883602142334, 'learning_rate': 1.4904235038305084e-05} -{'loss': 0.4279, 'grad_norm': 6.51883602142334, 'learning_rate': 1.4904235038305084e-05, 'epoch': 0.37} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05854741334915162, 'train/info_loss': 0.21058616042137146, 'train/ref_loss': None, 'train/uncertainty_loss': -9.162297938019038e-05, 'train/video_loss': 0.2104945331811905, 'train/total_loss': 0.2690419554710388} -tensor(0.0397, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008038039319217205, 'train/lm_loss': 0.00011223980691283942, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.04271562397480011, 'train/uncertainty_loss': -6.577377207577229e-05, 'train/video_loss': 0.04912140965461731, 'train/total_loss': 0.04923364892601967} -tensor(0.1357, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.305, 'grad_norm': 4.396207809448242, 'learning_rate': 1.489493565033069e-05}[Rank 0] Trainer log: {'loss': 0.305, 'grad_norm': 4.396207809448242, 'learning_rate': 1.489493565033069e-05}[Rank 2] Trainer log: {'loss': 0.305, 'grad_norm': 4.396207809448242, 'learning_rate': 1.489493565033069e-05} - -[Rank 1] Trainer log: {'loss': 0.305, 'grad_norm': 4.396207809448242, 'learning_rate': 1.489493565033069e-05} - -{'loss': 0.305, 'grad_norm': 4.396207809448242, 'learning_rate': 1.489493565033069e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10750242471694947, 'train/info_loss': 0.1669144630432129, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011365816462785006, 'train/video_loss': 0.16680081188678741, 'train/total_loss': 0.27430322766304016} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.4090, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1570, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009314822964370251, 'train/lm_loss': 8.652722463011742e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.30282163619995117, 'train/uncertainty_loss': 0.015700581669807433, 'train/video_loss': 0.32601091265678406, 'train/total_loss': 0.32609742879867554} -[Rank 3] Trainer log: {'loss': 0.4368, 'grad_norm': 11.70082950592041, 'learning_rate': 1.4885630692088269e-05}[Rank 2] Trainer log: {'loss': 0.4368, 'grad_norm': 11.70082950592041, 'learning_rate': 1.4885630692088269e-05}[Rank 1] Trainer log: {'loss': 0.4368, 'grad_norm': 11.70082950592041, 'learning_rate': 1.4885630692088269e-05} - - -[Rank 0] Trainer log: {'loss': 0.4368, 'grad_norm': 11.70082950592041, 'learning_rate': 1.4885630692088269e-05} -{'loss': 0.4368, 'grad_norm': 11.70082950592041, 'learning_rate': 1.4885630692088269e-05, 'epoch': 0.37} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2327, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0012425468303263187, 'train/lm_loss': 8.819542708806694e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.3679341673851013, 'train/uncertainty_loss': 0.023274020850658418, 'train/video_loss': 0.4011845886707306, 'train/total_loss': 0.4012727737426758} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09557578563690186, 'train/info_loss': 0.16491933166980743, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001573768211528659, 'train/video_loss': 0.1647619605064392, 'train/total_loss': 0.2603377401828766} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1050, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3181, 'grad_norm': 1.6709177494049072, 'learning_rate': 1.4876320174166544e-05}[Rank 3] Trainer log: {'loss': 0.3181, 'grad_norm': 1.6709177494049072, 'learning_rate': 1.4876320174166544e-05} -[Rank 2] Trainer log: {'loss': 0.3181, 'grad_norm': 1.6709177494049072, 'learning_rate': 1.4876320174166544e-05} - -[Rank 1] Trainer log: {'loss': 0.3181, 'grad_norm': 1.6709177494049072, 'learning_rate': 1.4876320174166544e-05} -{'loss': 0.3181, 'grad_norm': 1.6709177494049072, 'learning_rate': 1.4876320174166544e-05, 'epoch': 0.37} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.6777, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.48958935737609866, 'train/info_loss': 0.1565723717212677, 'train/ref_loss': None, 'train/uncertainty_loss': -9.930667001754046e-05, 'train/video_loss': 0.1564730703830719, 'train/total_loss': 0.646062433719635} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3650300264358521, 'train/info_loss': 0.13940048217773438, 'train/ref_loss': None, 'train/uncertainty_loss': -9.898096323013306e-05, 'train/video_loss': 0.13930150866508484, 'train/total_loss': 0.5043315291404724} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4812, 'grad_norm': 6.980248928070068, 'learning_rate': 1.4867004107160563e-05} -[Rank 2] Trainer log: {'loss': 0.4812, 'grad_norm': 6.980248928070068, 'learning_rate': 1.4867004107160563e-05}[Rank 0] Trainer log: {'loss': 0.4812, 'grad_norm': 6.980248928070068, 'learning_rate': 1.4867004107160563e-05} - -[Rank 1] Trainer log: {'loss': 0.4812, 'grad_norm': 6.980248928070068, 'learning_rate': 1.4867004107160563e-05} -{'loss': 0.4812, 'grad_norm': 6.980248928070068, 'learning_rate': 1.4867004107160563e-05, 'epoch': 0.37} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04426298439502716, 'train/info_loss': 0.3008870780467987, 'train/ref_loss': None, 'train/uncertainty_loss': -9.490891243331135e-05, 'train/video_loss': 0.3007921576499939, 'train/total_loss': 0.34505513310432434} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2567347526550293, 'train/info_loss': 0.2673966884613037, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001346872071735561, 'train/video_loss': 0.2672620117664337, 'train/total_loss': 0.5239967703819275} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3405, 'grad_norm': 3.9509010314941406, 'learning_rate': 1.4857682501671686e-05}[Rank 2] Trainer log: {'loss': 0.3405, 'grad_norm': 3.9509010314941406, 'learning_rate': 1.4857682501671686e-05}[Rank 1] Trainer log: {'loss': 0.3405, 'grad_norm': 3.9509010314941406, 'learning_rate': 1.4857682501671686e-05} - - -[Rank 0] Trainer log: {'loss': 0.3405, 'grad_norm': 3.9509010314941406, 'learning_rate': 1.4857682501671686e-05} -{'loss': 0.3405, 'grad_norm': 3.9509010314941406, 'learning_rate': 1.4857682501671686e-05, 'epoch': 0.37} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1647, device='cuda:0', grad_fn=) tensor(0.0565, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -{'train/tv_loss': 0.0007789116818457842, 'train/lm_loss': 0.00010063500376418234, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.3081527054309845, 'train/uncertainty_loss': 0.016467058658599855, 'train/video_loss': 0.3308922052383423, 'train/total_loss': 0.33099284768104553} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.241076135635376, 'train/info_loss': 0.1771499365568161, 'train/ref_loss': None, 'train/uncertainty_loss': -9.9668197799474e-05, 'train/video_loss': 0.17705026268959045, 'train/total_loss': 0.4181264042854309} -tensor(0.1172, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0460, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3874, 'grad_norm': 11.031661033630371, 'learning_rate': 1.4848355368307581e-05}[Rank 1] Trainer log: {'loss': 0.3874, 'grad_norm': 11.031661033630371, 'learning_rate': 1.4848355368307581e-05}[Rank 2] Trainer log: {'loss': 0.3874, 'grad_norm': 11.031661033630371, 'learning_rate': 1.4848355368307581e-05} - - -[Rank 0] Trainer log: {'loss': 0.3874, 'grad_norm': 11.031661033630371, 'learning_rate': 1.4848355368307581e-05} -{'loss': 0.3874, 'grad_norm': 11.031661033630371, 'learning_rate': 1.4848355368307581e-05, 'epoch': 0.37} -tensor(0.0663, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009015705436468124, 'train/lm_loss': 9.38433746341616e-05, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.2404152750968933, 'train/uncertainty_loss': 0.006630583852529526, 'train/video_loss': 0.25429919362068176, 'train/total_loss': 0.2543930411338806} -tensor(0.0023, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1583, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36142494678497317, 'train/info_loss': 0.20532284677028656, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001077659078873694, 'train/video_loss': 0.20521508157253265, 'train/total_loss': 0.5666400194168091} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0769, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3133, 'grad_norm': 3.021932601928711, 'learning_rate': 1.4839022717682205e-05} -[Rank 2] Trainer log: {'loss': 0.3133, 'grad_norm': 3.021932601928711, 'learning_rate': 1.4839022717682205e-05}[Rank 1] Trainer log: {'loss': 0.3133, 'grad_norm': 3.021932601928711, 'learning_rate': 1.4839022717682205e-05} - -[Rank 0] Trainer log: {'loss': 0.3133, 'grad_norm': 3.021932601928711, 'learning_rate': 1.4839022717682205e-05} -{'loss': 0.3133, 'grad_norm': 3.021932601928711, 'learning_rate': 1.4839022717682205e-05, 'epoch': 0.37} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2726943016052246, 'train/info_loss': 0.3870304226875305, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013401295291259885, 'train/video_loss': 0.38689640164375305, 'train/total_loss': 0.6595907211303711} -tensor(0.3292, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2717, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000839979387819767, 'train/lm_loss': 7.74948624894023e-05, 'train/info_loss': 3.6297911719884723e-05, 'train/ref_loss': 0.12534350156784058, 'train/uncertainty_loss': -7.219503168016672e-05, 'train/video_loss': 0.13202743232250214, 'train/total_loss': 0.13210493326187134} -tensor(0.0141, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3745, 'grad_norm': 8.921089172363281, 'learning_rate': 1.482968456041579e-05} -[Rank 3] Trainer log: {'loss': 0.3745, 'grad_norm': 8.921089172363281, 'learning_rate': 1.482968456041579e-05}[Rank 0] Trainer log: {'loss': 0.3745, 'grad_norm': 8.921089172363281, 'learning_rate': 1.482968456041579e-05} -[Rank 2] Trainer log: {'loss': 0.3745, 'grad_norm': 8.921089172363281, 'learning_rate': 1.482968456041579e-05} - -{'loss': 0.3745, 'grad_norm': 8.921089172363281, 'learning_rate': 1.482968456041579e-05, 'epoch': 0.37} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14123679399490358, 'train/info_loss': 0.1554073691368103, 'train/ref_loss': None, 'train/uncertainty_loss': -9.847090113908054e-05, 'train/video_loss': 0.15530890226364136, 'train/total_loss': 0.29654568433761597} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0593, device='cuda:0', grad_fn=) tensor(0.1190, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -{'train/tv_loss': 0.0008913602679967881, 'train/lm_loss': 8.871971513144672e-05, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.23602737486362457, 'train/uncertainty_loss': 0.0059329021722078325, 'train/video_loss': 0.24913227558135986, 'train/total_loss': 0.24922099709510803} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2601, 'grad_norm': 5.811514377593994, 'learning_rate': 1.482034090713484e-05}[Rank 2] Trainer log: {'loss': 0.2601, 'grad_norm': 5.811514377593994, 'learning_rate': 1.482034090713484e-05} - -[Rank 1] Trainer log: {'loss': 0.2601, 'grad_norm': 5.811514377593994, 'learning_rate': 1.482034090713484e-05} -[Rank 0] Trainer log: {'loss': 0.2601, 'grad_norm': 5.811514377593994, 'learning_rate': 1.482034090713484e-05} -{'loss': 0.2601, 'grad_norm': 5.811514377593994, 'learning_rate': 1.482034090713484e-05, 'epoch': 0.37} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.2753, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007834527641534805, 'train/lm_loss': 0.00012098466977477074, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.4041358232498169, 'train/uncertainty_loss': 0.027530750632286074, 'train/video_loss': 0.43797728419303894, 'train/total_loss': 0.438098281621933} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1897, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03847850561141968, 'train/info_loss': 0.13089849054813385, 'train/ref_loss': None, 'train/uncertainty_loss': -9.925863705575467e-05, 'train/video_loss': 0.13079923391342163, 'train/total_loss': 0.16927774250507355} -tensor(0.0984, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3551, 'grad_norm': 1.6929435729980469, 'learning_rate': 1.4810991768472106e-05} -[Rank 3] Trainer log: {'loss': 0.3551, 'grad_norm': 1.6929435729980469, 'learning_rate': 1.4810991768472106e-05}[Rank 2] Trainer log: {'loss': 0.3551, 'grad_norm': 1.6929435729980469, 'learning_rate': 1.4810991768472106e-05} -[Rank 0] Trainer log: {'loss': 0.3551, 'grad_norm': 1.6929435729980469, 'learning_rate': 1.4810991768472106e-05} - -{'loss': 0.3551, 'grad_norm': 1.6929435729980469, 'learning_rate': 1.4810991768472106e-05, 'epoch': 0.37} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0659, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008199810050427914, 'train/lm_loss': 8.786178659647703e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.24678486585617065, 'train/uncertainty_loss': 0.0065938010811805725, 'train/video_loss': 0.2599777281284332, 'train/total_loss': 0.26006558537483215} -tensor(0.3486, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20186798572540285, 'train/info_loss': 0.17047454416751862, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011158281704410912, 'train/video_loss': 0.17036296427249908, 'train/total_loss': 0.3722309470176697} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.5216, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4524, 'grad_norm': 14.713385581970215, 'learning_rate': 1.4801637155066595e-05}[Rank 3] Trainer log: {'loss': 0.4524, 'grad_norm': 14.713385581970215, 'learning_rate': 1.4801637155066595e-05}[Rank 2] Trainer log: {'loss': 0.4524, 'grad_norm': 14.713385581970215, 'learning_rate': 1.4801637155066595e-05}[Rank 0] Trainer log: {'loss': 0.4524, 'grad_norm': 14.713385581970215, 'learning_rate': 1.4801637155066595e-05} - - - -{'loss': 0.4524, 'grad_norm': 14.713385581970215, 'learning_rate': 1.4801637155066595e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16483497619628906, 'train/info_loss': 0.2036459892988205, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011206782655790449, 'train/video_loss': 0.20353391766548157, 'train/total_loss': 0.36836889386177063} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0264, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2597, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008122126571834088, 'train/lm_loss': 7.751869852654636e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.0602579228579998, 'train/uncertainty_loss': -7.124426192604006e-05, 'train/video_loss': 0.06671954691410065, 'train/total_loss': 0.06679706275463104} -[Rank 3] Trainer log: {'loss': 0.3294, 'grad_norm': 4.080789566040039, 'learning_rate': 1.4792277077563525e-05}[Rank 0] Trainer log: {'loss': 0.3294, 'grad_norm': 4.080789566040039, 'learning_rate': 1.4792277077563525e-05}[Rank 2] Trainer log: {'loss': 0.3294, 'grad_norm': 4.080789566040039, 'learning_rate': 1.4792277077563525e-05} - -[Rank 1] Trainer log: {'loss': 0.3294, 'grad_norm': 4.080789566040039, 'learning_rate': 1.4792277077563525e-05} - -{'loss': 0.3294, 'grad_norm': 4.080789566040039, 'learning_rate': 1.4792277077563525e-05, 'epoch': 0.37} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29558274745941165, 'train/info_loss': 0.3219328820705414, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013030333211645485, 'train/video_loss': 0.3218025863170624, 'train/total_loss': 0.6173853278160095} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0553, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3330230712890625, 'train/info_loss': 0.18801617622375488, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013379878364503384, 'train/video_loss': 0.18788237869739532, 'train/total_loss': 0.5209054350852966} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4464, 'grad_norm': 5.748510837554932, 'learning_rate': 1.4782911546614344e-05}[Rank 1] Trainer log: {'loss': 0.4464, 'grad_norm': 5.748510837554932, 'learning_rate': 1.4782911546614344e-05}[Rank 2] Trainer log: {'loss': 0.4464, 'grad_norm': 5.748510837554932, 'learning_rate': 1.4782911546614344e-05} - - -[Rank 0] Trainer log: {'loss': 0.4464, 'grad_norm': 5.748510837554932, 'learning_rate': 1.4782911546614344e-05} -{'loss': 0.4464, 'grad_norm': 5.748510837554932, 'learning_rate': 1.4782911546614344e-05, 'epoch': 0.37} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27165608406066893, 'train/info_loss': 0.1858316957950592, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010301085421815515, 'train/video_loss': 0.18572868406772614, 'train/total_loss': 0.45738476514816284} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31857120990753174, 'train/info_loss': 0.11101125925779343, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000113195413723588, 'train/video_loss': 0.11089806258678436, 'train/total_loss': 0.4294692873954773} -tensor(0.0400, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4073, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3483, 'grad_norm': 5.359172344207764, 'learning_rate': 1.4773540572876703e-05}[Rank 2] Trainer log: {'loss': 0.3483, 'grad_norm': 5.359172344207764, 'learning_rate': 1.4773540572876703e-05}[Rank 3] Trainer log: {'loss': 0.3483, 'grad_norm': 5.359172344207764, 'learning_rate': 1.4773540572876703e-05} - - -[Rank 0] Trainer log: {'loss': 0.3483, 'grad_norm': 5.359172344207764, 'learning_rate': 1.4773540572876703e-05} -{'loss': 0.3483, 'grad_norm': 5.359172344207764, 'learning_rate': 1.4773540572876703e-05, 'epoch': 0.37} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2382, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0009379643946886063, 'train/lm_loss': 7.82098446507007e-05, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.3698093295097351, 'train/uncertainty_loss': 0.023824457824230195, 'train/video_loss': 0.40117159485816956, 'train/total_loss': 0.40124979615211487} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4552823543548584, 'train/info_loss': 0.17465274035930634, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011966751189902426, 'train/video_loss': 0.17453306913375854, 'train/total_loss': 0.6298154592514038} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0870, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4132, 'grad_norm': 8.145251274108887, 'learning_rate': 1.4764164167014451e-05}[Rank 3] Trainer log: {'loss': 0.4132, 'grad_norm': 8.145251274108887, 'learning_rate': 1.4764164167014451e-05}[Rank 2] Trainer log: {'loss': 0.4132, 'grad_norm': 8.145251274108887, 'learning_rate': 1.4764164167014451e-05} - - -[Rank 0] Trainer log: {'loss': 0.4132, 'grad_norm': 8.145251274108887, 'learning_rate': 1.4764164167014451e-05} -{'loss': 0.4132, 'grad_norm': 8.145251274108887, 'learning_rate': 1.4764164167014451e-05, 'epoch': 0.37} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14643287658691406, 'train/info_loss': 0.14038188755512238, 'train/ref_loss': None, 'train/uncertainty_loss': -9.222210501320661e-05, 'train/video_loss': 0.14028966426849365, 'train/total_loss': 0.2867225408554077} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0678395688533783, 'train/info_loss': 0.35017815232276917, 'train/ref_loss': None, 'train/uncertainty_loss': -8.947788737714291e-05, 'train/video_loss': 0.3500886857509613, 'train/total_loss': 0.4179282486438751} -tensor(0.0388, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2891, 'grad_norm': 6.289341926574707, 'learning_rate': 1.475478233969761e-05} -[Rank 0] Trainer log: {'loss': 0.2891, 'grad_norm': 6.289341926574707, 'learning_rate': 1.475478233969761e-05}[Rank 3] Trainer log: {'loss': 0.2891, 'grad_norm': 6.289341926574707, 'learning_rate': 1.475478233969761e-05} -[Rank 1] Trainer log: {'loss': 0.2891, 'grad_norm': 6.289341926574707, 'learning_rate': 1.475478233969761e-05} - -{'loss': 0.2891, 'grad_norm': 6.289341926574707, 'learning_rate': 1.475478233969761e-05, 'epoch': 0.38} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006925893016159535, 'train/lm_loss': 7.792385295033455e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.16657432913780212, 'train/uncertainty_loss': -7.672923966310919e-05, 'train/video_loss': 0.17207515239715576, 'train/total_loss': 0.1721530705690384} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006028398405760528, 'train/lm_loss': 8.743281941860915e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.19374971091747284, 'train/uncertainty_loss': -7.479157648049295e-05, 'train/video_loss': 0.19853685796260834, 'train/total_loss': 0.19862429797649384} -tensor(0.0079, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4027, 'grad_norm': 4.947473049163818, 'learning_rate': 1.474539510160238e-05}[Rank 3] Trainer log: {'loss': 0.4027, 'grad_norm': 4.947473049163818, 'learning_rate': 1.474539510160238e-05}[Rank 0] Trainer log: {'loss': 0.4027, 'grad_norm': 4.947473049163818, 'learning_rate': 1.474539510160238e-05} - -[Rank 2] Trainer log: {'loss': 0.4027, 'grad_norm': 4.947473049163818, 'learning_rate': 1.474539510160238e-05} - -{'loss': 0.4027, 'grad_norm': 4.947473049163818, 'learning_rate': 1.474539510160238e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000591878267005086, 'train/lm_loss': 7.654155488125981e-05, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.10778993368148804, 'train/uncertainty_loss': -7.164519047364593e-05, 'train/video_loss': 0.11249104142189026, 'train/total_loss': 0.11256758123636246} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1969836711883545, 'train/info_loss': 0.1386205404996872, 'train/ref_loss': None, 'train/uncertainty_loss': -9.815626544877887e-05, 'train/video_loss': 0.13852238655090332, 'train/total_loss': 0.3355060815811157} -[Rank 3] Trainer log: {'loss': 0.4539, 'grad_norm': 2.700230360031128, 'learning_rate': 1.473600246341111e-05}[Rank 1] Trainer log: {'loss': 0.4539, 'grad_norm': 2.700230360031128, 'learning_rate': 1.473600246341111e-05}[Rank 0] Trainer log: {'loss': 0.4539, 'grad_norm': 2.700230360031128, 'learning_rate': 1.473600246341111e-05} - - -[Rank 2] Trainer log: {'loss': 0.4539, 'grad_norm': 2.700230360031128, 'learning_rate': 1.473600246341111e-05} -{'loss': 0.4539, 'grad_norm': 2.700230360031128, 'learning_rate': 1.473600246341111e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3349081039428711, 'train/info_loss': 0.18082164227962494, 'train/ref_loss': None, 'train/uncertainty_loss': -9.943055920302868e-05, 'train/video_loss': 0.1807222068309784, 'train/total_loss': 0.515630304813385} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0945, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007084787823259831, 'train/lm_loss': 8.728983229957522e-05, 'train/info_loss': 3.862231824314222e-05, 'train/ref_loss': 0.27416568994522095, 'train/uncertainty_loss': 0.009448271244764328, 'train/video_loss': 0.2893204092979431, 'train/total_loss': 0.2894077003002167} -[Rank 2] Trainer log: {'loss': 0.2977, 'grad_norm': 10.785696983337402, 'learning_rate': 1.4726604435812302e-05}[Rank 1] Trainer log: {'loss': 0.2977, 'grad_norm': 10.785696983337402, 'learning_rate': 1.4726604435812302e-05}[Rank 0] Trainer log: {'loss': 0.2977, 'grad_norm': 10.785696983337402, 'learning_rate': 1.4726604435812302e-05} - -[Rank 3] Trainer log: {'loss': 0.2977, 'grad_norm': 10.785696983337402, 'learning_rate': 1.4726604435812302e-05} - -{'loss': 0.2977, 'grad_norm': 10.785696983337402, 'learning_rate': 1.4726604435812302e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.3156, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005467121489346028, 'train/lm_loss': 7.682755240239203e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.4271990656852722, 'train/uncertainty_loss': 0.03156073987483978, 'train/video_loss': 0.4631706476211548, 'train/total_loss': 0.46324747800827026} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3002610206604004, 'train/info_loss': 0.23848992586135864, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011251561809331179, 'train/video_loss': 0.2383774071931839, 'train/total_loss': 0.5386384129524231} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0283, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.365, 'grad_norm': 7.76344108581543, 'learning_rate': 1.4717201029500583e-05}[Rank 2] Trainer log: {'loss': 0.365, 'grad_norm': 7.76344108581543, 'learning_rate': 1.4717201029500583e-05}[Rank 1] Trainer log: {'loss': 0.365, 'grad_norm': 7.76344108581543, 'learning_rate': 1.4717201029500583e-05} - - -[Rank 0] Trainer log: {'loss': 0.365, 'grad_norm': 7.76344108581543, 'learning_rate': 1.4717201029500583e-05} -{'loss': 0.365, 'grad_norm': 7.76344108581543, 'learning_rate': 1.4717201029500583e-05, 'epoch': 0.38} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005464762449264527, 'train/lm_loss': 0.00013916417956352233, 'train/info_loss': 4.410549081512727e-05, 'train/ref_loss': 0.19055448472499847, 'train/uncertainty_loss': -7.733948878012598e-05, 'train/video_loss': 0.19489306211471558, 'train/total_loss': 0.19503222405910492} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0043, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005483495537191629, 'train/lm_loss': 5.239754100330174e-05, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.08397065103054047, 'train/uncertainty_loss': -7.103672833181917e-05, 'train/video_loss': 0.08832050859928131, 'train/total_loss': 0.08837290853261948} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2925, 'grad_norm': 8.994170188903809, 'learning_rate': 1.4707792255176714e-05}[Rank 2] Trainer log: {'loss': 0.2925, 'grad_norm': 8.994170188903809, 'learning_rate': 1.4707792255176714e-05}[Rank 3] Trainer log: {'loss': 0.2925, 'grad_norm': 8.994170188903809, 'learning_rate': 1.4707792255176714e-05} - - -[Rank 0] Trainer log: {'loss': 0.2925, 'grad_norm': 8.994170188903809, 'learning_rate': 1.4707792255176714e-05} -{'loss': 0.2925, 'grad_norm': 8.994170188903809, 'learning_rate': 1.4707792255176714e-05, 'epoch': 0.38} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1068, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00046188901178538804, 'train/lm_loss': 8.745664963498712e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.09520679712295532, 'train/uncertainty_loss': -7.914883317425848e-05, 'train/video_loss': 0.09886198490858078, 'train/total_loss': 0.09894943982362747} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005316596943885088, 'train/lm_loss': 0.00010721193393692374, 'train/info_loss': 4.172151238890365e-05, 'train/ref_loss': 0.1679275631904602, 'train/uncertainty_loss': -7.436622981913388e-05, 'train/video_loss': 0.172148197889328, 'train/total_loss': 0.17225541174411774} -tensor(0.0233, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.246, 'grad_norm': 7.525926113128662, 'learning_rate': 1.4698378123547546e-05}[Rank 2] Trainer log: {'loss': 0.246, 'grad_norm': 7.525926113128662, 'learning_rate': 1.4698378123547546e-05} -[Rank 1] Trainer log: {'loss': 0.246, 'grad_norm': 7.525926113128662, 'learning_rate': 1.4698378123547546e-05} -[Rank 3] Trainer log: {'loss': 0.246, 'grad_norm': 7.525926113128662, 'learning_rate': 1.4698378123547546e-05} - -{'loss': 0.246, 'grad_norm': 7.525926113128662, 'learning_rate': 1.4698378123547546e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14383591413497926, 'train/info_loss': 0.23893707990646362, 'train/ref_loss': None, 'train/uncertainty_loss': -8.479680982418359e-05, 'train/video_loss': 0.23885227739810944, 'train/total_loss': 0.3826881945133209} -tensor(0.0267, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0575, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004722346551716328, 'train/lm_loss': 0.00012825190788134933, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.25818488001823425, 'train/uncertainty_loss': 0.005753329768776894, 'train/video_loss': 0.26775920391082764, 'train/total_loss': 0.2678874433040619} -[Rank 3] Trainer log: {'loss': 0.389, 'grad_norm': 3.039344072341919, 'learning_rate': 1.4688958645326042e-05} -[Rank 0] Trainer log: {'loss': 0.389, 'grad_norm': 3.039344072341919, 'learning_rate': 1.4688958645326042e-05}[Rank 1] Trainer log: {'loss': 0.389, 'grad_norm': 3.039344072341919, 'learning_rate': 1.4688958645326042e-05} -[Rank 2] Trainer log: {'loss': 0.389, 'grad_norm': 3.039344072341919, 'learning_rate': 1.4688958645326042e-05} - -{'loss': 0.389, 'grad_norm': 3.039344072341919, 'learning_rate': 1.4688958645326042e-05, 'epoch': 0.38} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1712, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004665999673306942, 'train/lm_loss': 8.905334980227054e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.32538944482803345, 'train/uncertainty_loss': 0.01711627691984177, 'train/video_loss': 0.34627535939216614, 'train/total_loss': 0.34636440873146057} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16367030143737793, 'train/info_loss': 0.12583664059638977, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001227773609571159, 'train/video_loss': 0.12571386992931366, 'train/total_loss': 0.2893841862678528} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3395, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3121, 'grad_norm': 19.54503059387207, 'learning_rate': 1.4679533831231239e-05}[Rank 3] Trainer log: {'loss': 0.3121, 'grad_norm': 19.54503059387207, 'learning_rate': 1.4679533831231239e-05}[Rank 0] Trainer log: {'loss': 0.3121, 'grad_norm': 19.54503059387207, 'learning_rate': 1.4679533831231239e-05} - -[Rank 2] Trainer log: {'loss': 0.3121, 'grad_norm': 19.54503059387207, 'learning_rate': 1.4679533831231239e-05} - -{'loss': 0.3121, 'grad_norm': 19.54503059387207, 'learning_rate': 1.4679533831231239e-05, 'epoch': 0.38} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3657, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00037199601065367465, 'train/lm_loss': 0.00011483709095045925, 'train/info_loss': 4.4463089579949155e-05, 'train/ref_loss': 0.06456094235181808, 'train/uncertainty_loss': -7.083482923917473e-05, 'train/video_loss': 0.06751054525375366, 'train/total_loss': 0.06762538105249405} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1014, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003445826238021255, 'train/lm_loss': 7.62317329645157e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.2852287292480469, 'train/uncertainty_loss': 0.010139705985784531, 'train/video_loss': 0.29815995693206787, 'train/total_loss': 0.2982361912727356} -[Rank 0] Trainer log: {'loss': 0.3351, 'grad_norm': 6.229938983917236, 'learning_rate': 1.4670103691988256e-05}[Rank 1] Trainer log: {'loss': 0.3351, 'grad_norm': 6.229938983917236, 'learning_rate': 1.4670103691988256e-05} -[Rank 2] Trainer log: {'loss': 0.3351, 'grad_norm': 6.229938983917236, 'learning_rate': 1.4670103691988256e-05} - -[Rank 3] Trainer log: {'loss': 0.3351, 'grad_norm': 6.229938983917236, 'learning_rate': 1.4670103691988256e-05} -{'loss': 0.3351, 'grad_norm': 6.229938983917236, 'learning_rate': 1.4670103691988256e-05, 'epoch': 0.38} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000295386859215796, 'train/lm_loss': 0.00012141356710344554, 'train/info_loss': 4.4463089579949155e-05, 'train/ref_loss': 0.11614163964986801, 'train/uncertainty_loss': -7.555320626124741e-05, 'train/video_loss': 0.11847364157438278, 'train/total_loss': 0.11859505623579025} -tensor(0.1516, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.3892, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0017, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.45522046089172363, 'train/info_loss': 0.3452918827533722, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00017471329774707556, 'train/video_loss': 0.3451171815395355, 'train/total_loss': 0.8003376722335815} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.4101, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.421, 'grad_norm': 10.4870023727417, 'learning_rate': 1.4660668238328262e-05}[Rank 1] Trainer log: {'loss': 0.421, 'grad_norm': 10.4870023727417, 'learning_rate': 1.4660668238328262e-05}[Rank 2] Trainer log: {'loss': 0.421, 'grad_norm': 10.4870023727417, 'learning_rate': 1.4660668238328262e-05} - - -[Rank 0] Trainer log: {'loss': 0.421, 'grad_norm': 10.4870023727417, 'learning_rate': 1.4660668238328262e-05} -{'loss': 0.421, 'grad_norm': 10.4870023727417, 'learning_rate': 1.4660668238328262e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.4784, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003041001269593835, 'train/lm_loss': 9.994393913075328e-05, 'train/info_loss': 4.172151238890365e-05, 'train/ref_loss': 0.47725003957748413, 'train/uncertainty_loss': 0.04784381985664368, 'train/video_loss': 0.5275683999061584, 'train/total_loss': 0.5276683568954468} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.240520977973938, 'train/info_loss': 0.23548544943332672, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010999187361449003, 'train/video_loss': 0.23537546396255493, 'train/total_loss': 0.4758964478969574} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4444, 'grad_norm': 7.150793552398682, 'learning_rate': 1.4651227480988484e-05}[Rank 3] Trainer log: {'loss': 0.4444, 'grad_norm': 7.150793552398682, 'learning_rate': 1.4651227480988484e-05}[Rank 1] Trainer log: {'loss': 0.4444, 'grad_norm': 7.150793552398682, 'learning_rate': 1.4651227480988484e-05} - -[Rank 0] Trainer log: {'loss': 0.4444, 'grad_norm': 7.150793552398682, 'learning_rate': 1.4651227480988484e-05} - -{'loss': 0.4444, 'grad_norm': 7.150793552398682, 'learning_rate': 1.4651227480988484e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3613159656524658, 'train/info_loss': 0.1930738389492035, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011062993435189128, 'train/video_loss': 0.19296321272850037, 'train/total_loss': 0.5542792081832886} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0804, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36087419986724856, 'train/info_loss': 0.18505734205245972, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011722591007128358, 'train/video_loss': 0.18494011461734772, 'train/total_loss': 0.5458143353462219} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.4833, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4311, 'grad_norm': 6.871325492858887, 'learning_rate': 1.464178143071217e-05}[Rank 3] Trainer log: {'loss': 0.4311, 'grad_norm': 6.871325492858887, 'learning_rate': 1.464178143071217e-05} - -[Rank 0] Trainer log: {'loss': 0.4311, 'grad_norm': 6.871325492858887, 'learning_rate': 1.464178143071217e-05}[Rank 1] Trainer log: {'loss': 0.4311, 'grad_norm': 6.871325492858887, 'learning_rate': 1.464178143071217e-05} - -{'loss': 0.4311, 'grad_norm': 6.871325492858887, 'learning_rate': 1.464178143071217e-05, 'epoch': 0.38} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035025784745812416, 'train/lm_loss': 0.00010590133024379611, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.20787984132766724, 'train/uncertainty_loss': -6.909461226314307e-05, 'train/video_loss': 0.21065393090248108, 'train/total_loss': 0.21075983345508575} -tensor(-0.0018, device='cuda:1', grad_fn=) tensor(-0.0018, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11596829891204835, 'train/info_loss': 0.23125816881656647, 'train/ref_loss': None, 'train/uncertainty_loss': -9.696989436633885e-05, 'train/video_loss': 0.2311611920595169, 'train/total_loss': 0.3471294939517975} -[Rank 3] Trainer log: {'loss': 0.3189, 'grad_norm': 6.8558149337768555, 'learning_rate': 1.4632330098248604e-05}[Rank 1] Trainer log: {'loss': 0.3189, 'grad_norm': 6.8558149337768555, 'learning_rate': 1.4632330098248604e-05}[Rank 2] Trainer log: {'loss': 0.3189, 'grad_norm': 6.8558149337768555, 'learning_rate': 1.4632330098248604e-05} - - -[Rank 0] Trainer log: {'loss': 0.3189, 'grad_norm': 6.8558149337768555, 'learning_rate': 1.4632330098248604e-05} -{'loss': 0.3189, 'grad_norm': 6.8558149337768555, 'learning_rate': 1.4632330098248604e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07330606579780578, 'train/info_loss': 0.12832851707935333, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010693742660805583, 'train/video_loss': 0.12822158634662628, 'train/total_loss': 0.2015276551246643} -tensor(0.1331, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1954, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1489, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0015440289862453938, 'train/lm_loss': 0.00012732266914099457, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.31461700797080994, 'train/uncertainty_loss': 0.01488795578479767, 'train/video_loss': 0.3419002890586853, 'train/total_loss': 0.34202760457992554} -[Rank 3] Trainer log: {'loss': 0.355, 'grad_norm': 3.619764566421509, 'learning_rate': 1.4622873494353078e-05}[Rank 1] Trainer log: {'loss': 0.355, 'grad_norm': 3.619764566421509, 'learning_rate': 1.4622873494353078e-05}[Rank 0] Trainer log: {'loss': 0.355, 'grad_norm': 3.619764566421509, 'learning_rate': 1.4622873494353078e-05} - -[Rank 2] Trainer log: {'loss': 0.355, 'grad_norm': 3.619764566421509, 'learning_rate': 1.4622873494353078e-05} - -{'loss': 0.355, 'grad_norm': 3.619764566421509, 'learning_rate': 1.4622873494353078e-05, 'epoch': 0.38} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2702240228652954, 'train/info_loss': 0.17971810698509216, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012501550372689962, 'train/video_loss': 0.17959308624267578, 'train/total_loss': 0.44981712102890015} -tensor(0.2105, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027966124471277, 'train/lm_loss': 0.000100039248354733, 'train/info_loss': 3.80263190891128e-05, 'train/ref_loss': 0.16758424043655396, 'train/uncertainty_loss': -7.233077776618302e-05, 'train/video_loss': 0.1697872281074524, 'train/total_loss': 0.16988727450370789} -tensor(0.1373, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3278, 'grad_norm': 13.483813285827637, 'learning_rate': 1.461341162978688e-05}[Rank 3] Trainer log: {'loss': 0.3278, 'grad_norm': 13.483813285827637, 'learning_rate': 1.461341162978688e-05} - -[Rank 2] Trainer log: {'loss': 0.3278, 'grad_norm': 13.483813285827637, 'learning_rate': 1.461341162978688e-05} -[Rank 0] Trainer log: {'loss': 0.3278, 'grad_norm': 13.483813285827637, 'learning_rate': 1.461341162978688e-05} -{'loss': 0.3278, 'grad_norm': 13.483813285827637, 'learning_rate': 1.461341162978688e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0129, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0974, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000606312369927764, 'train/lm_loss': 9.300929959863425e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.28117191791534424, 'train/uncertainty_loss': 0.00973847359418869, 'train/video_loss': 0.295800119638443, 'train/total_loss': 0.295893132686615} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1072, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00047720433212816715, 'train/lm_loss': 0.00017811470897868278, 'train/info_loss': 4.4463089579949155e-05, 'train/ref_loss': 0.2818828821182251, 'train/uncertainty_loss': 0.01072465255856514, 'train/video_loss': 0.29646965861320496, 'train/total_loss': 0.2966477870941162} -[Rank 0] Trainer log: {'loss': 0.3439, 'grad_norm': 7.048087120056152, 'learning_rate': 1.4603944515317284e-05}[Rank 1] Trainer log: {'loss': 0.3439, 'grad_norm': 7.048087120056152, 'learning_rate': 1.4603944515317284e-05}[Rank 2] Trainer log: {'loss': 0.3439, 'grad_norm': 7.048087120056152, 'learning_rate': 1.4603944515317284e-05} - -[Rank 3] Trainer log: {'loss': 0.3439, 'grad_norm': 7.048087120056152, 'learning_rate': 1.4603944515317284e-05} - -{'loss': 0.3439, 'grad_norm': 7.048087120056152, 'learning_rate': 1.4603944515317284e-05, 'epoch': 0.38} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5459500789642334, 'train/info_loss': 0.1937795728445053, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014438965590670704, 'train/video_loss': 0.19363518059253693, 'train/total_loss': 0.739585280418396} -tensor(-0.0017, device='cuda:2', grad_fn=) tensor(-0.0017, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1884, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00034536221064627174, 'train/lm_loss': 6.758024683222175e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.3403252363204956, 'train/uncertainty_loss': 0.018836973607540133, 'train/video_loss': 0.3619583547115326, 'train/total_loss': 0.3620259463787079} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4197, 'grad_norm': 4.5460429191589355, 'learning_rate': 1.4594472161717538e-05}[Rank 0] Trainer log: {'loss': 0.4197, 'grad_norm': 4.5460429191589355, 'learning_rate': 1.4594472161717538e-05} -[Rank 1] Trainer log: {'loss': 0.4197, 'grad_norm': 4.5460429191589355, 'learning_rate': 1.4594472161717538e-05} -[Rank 3] Trainer log: {'loss': 0.4197, 'grad_norm': 4.5460429191589355, 'learning_rate': 1.4594472161717538e-05} - -{'loss': 0.4197, 'grad_norm': 4.5460429191589355, 'learning_rate': 1.4594472161717538e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1071, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005814726464450359, 'train/lm_loss': 0.00010709279449656607, 'train/info_loss': 4.142351099289954e-05, 'train/ref_loss': 0.1471662074327469, 'train/uncertainty_loss': -7.751506054773927e-05, 'train/video_loss': 0.15178190171718597, 'train/total_loss': 0.15188899636268616} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3191608190536499, 'train/info_loss': 0.2771400511264801, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013838732847943903, 'train/video_loss': 0.27700164914131165, 'train/total_loss': 0.5961624383926392} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3377, 'grad_norm': 4.1200151443481445, 'learning_rate': 1.4584994579766865e-05}[Rank 0] Trainer log: {'loss': 0.3377, 'grad_norm': 4.1200151443481445, 'learning_rate': 1.4584994579766865e-05}[Rank 1] Trainer log: {'loss': 0.3377, 'grad_norm': 4.1200151443481445, 'learning_rate': 1.4584994579766865e-05} - -[Rank 3] Trainer log: {'loss': 0.3377, 'grad_norm': 4.1200151443481445, 'learning_rate': 1.4584994579766865e-05} - -{'loss': 0.3377, 'grad_norm': 4.1200151443481445, 'learning_rate': 1.4584994579766865e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11097300052642822, 'train/info_loss': 0.09289895743131638, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012763000559061767, 'train/video_loss': 0.09277132898569107, 'train/total_loss': 0.2037443220615387} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.2272, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2365, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3578636407852173, 'train/info_loss': 0.16937194764614105, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011229310184717179, 'train/video_loss': 0.16925965249538422, 'train/total_loss': 0.5271232724189758} -tensor(0.4510, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0357, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4048, 'grad_norm': 9.222996711730957, 'learning_rate': 1.4575511780250413e-05} -[Rank 0] Trainer log: {'loss': 0.4048, 'grad_norm': 9.222996711730957, 'learning_rate': 1.4575511780250413e-05}[Rank 3] Trainer log: {'loss': 0.4048, 'grad_norm': 9.222996711730957, 'learning_rate': 1.4575511780250413e-05} -[Rank 1] Trainer log: {'loss': 0.4048, 'grad_norm': 9.222996711730957, 'learning_rate': 1.4575511780250413e-05} - -{'loss': 0.4048, 'grad_norm': 9.222996711730957, 'learning_rate': 1.4575511780250413e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2607626438140869, 'train/info_loss': 0.26494452357292175, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012357297819107771, 'train/video_loss': 0.26482096314430237, 'train/total_loss': 0.5255836248397827} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3991572380065918, 'train/info_loss': 0.24637703597545624, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013315838295966387, 'train/video_loss': 0.24624387919902802, 'train/total_loss': 0.645401120185852} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0350, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4136, 'grad_norm': 5.537405967712402, 'learning_rate': 1.4566023773959288e-05}[Rank 0] Trainer log: {'loss': 0.4136, 'grad_norm': 5.537405967712402, 'learning_rate': 1.4566023773959288e-05}[Rank 1] Trainer log: {'loss': 0.4136, 'grad_norm': 5.537405967712402, 'learning_rate': 1.4566023773959288e-05} - - -[Rank 2] Trainer log: {'loss': 0.4136, 'grad_norm': 5.537405967712402, 'learning_rate': 1.4566023773959288e-05} -{'loss': 0.4136, 'grad_norm': 5.537405967712402, 'learning_rate': 1.4566023773959288e-05, 'epoch': 0.38} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1792718768119812, 'train/info_loss': 0.20300506055355072, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010146930580958725, 'train/video_loss': 0.2029035985469818, 'train/total_loss': 0.382175475358963} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1112721085548401, 'train/info_loss': 0.23818306624889374, 'train/ref_loss': None, 'train/uncertainty_loss': -8.346617687493563e-05, 'train/video_loss': 0.238099604845047, 'train/total_loss': 0.3493717312812805} -[Rank 2] Trainer log: {'loss': 0.464, 'grad_norm': 2.2262890338897705, 'learning_rate': 1.4556530571690513e-05} -[Rank 1] Trainer log: {'loss': 0.464, 'grad_norm': 2.2262890338897705, 'learning_rate': 1.4556530571690513e-05} -[Rank 3] Trainer log: {'loss': 0.464, 'grad_norm': 2.2262890338897705, 'learning_rate': 1.4556530571690513e-05} -[Rank 0] Trainer log: {'loss': 0.464, 'grad_norm': 2.2262890338897705, 'learning_rate': 1.4556530571690513e-05} -{'loss': 0.464, 'grad_norm': 2.2262890338897705, 'learning_rate': 1.4556530571690513e-05, 'epoch': 0.38} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12223913669586182, 'train/info_loss': 0.22842583060264587, 'train/ref_loss': None, 'train/uncertainty_loss': -9.95842507109046e-05, 'train/video_loss': 0.2283262461423874, 'train/total_loss': 0.3505653738975525} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004723156336694956, 'train/lm_loss': 5.964343436062336e-05, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.1558982878923416, 'train/uncertainty_loss': -7.56506051402539e-05, 'train/video_loss': 0.15963317453861237, 'train/total_loss': 0.15969282388687134} -[Rank 3] Trainer log: {'loss': 0.277, 'grad_norm': 5.7588911056518555, 'learning_rate': 1.4547032184247022e-05} -[Rank 1] Trainer log: {'loss': 0.277, 'grad_norm': 5.7588911056518555, 'learning_rate': 1.4547032184247022e-05} -[Rank 0] Trainer log: {'loss': 0.277, 'grad_norm': 5.7588911056518555, 'learning_rate': 1.4547032184247022e-05} -[Rank 2] Trainer log: {'loss': 0.277, 'grad_norm': 5.7588911056518555, 'learning_rate': 1.4547032184247022e-05} -{'loss': 0.277, 'grad_norm': 5.7588911056518555, 'learning_rate': 1.4547032184247022e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.6313, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31824350357055664, 'train/info_loss': 0.31116563081741333, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001421982189640403, 'train/video_loss': 0.31102344393730164, 'train/total_loss': 0.6292669773101807} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1374538540840149, 'train/info_loss': 0.10954345762729645, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011899120872840286, 'train/video_loss': 0.10942446440458298, 'train/total_loss': 0.24687832593917847} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0208, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4793, 'grad_norm': 11.67172908782959, 'learning_rate': 1.4537528622437656e-05}[Rank 3] Trainer log: {'loss': 0.4793, 'grad_norm': 11.67172908782959, 'learning_rate': 1.4537528622437656e-05}[Rank 2] Trainer log: {'loss': 0.4793, 'grad_norm': 11.67172908782959, 'learning_rate': 1.4537528622437656e-05} - - -[Rank 1] Trainer log: {'loss': 0.4793, 'grad_norm': 11.67172908782959, 'learning_rate': 1.4537528622437656e-05} -{'loss': 0.4793, 'grad_norm': 11.67172908782959, 'learning_rate': 1.4537528622437656e-05, 'epoch': 0.38} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33481931686401367, 'train/info_loss': 0.20790395140647888, 'train/ref_loss': None, 'train/uncertainty_loss': -9.263175306841732e-05, 'train/video_loss': 0.20781132578849792, 'train/total_loss': 0.542630672454834} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0191, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003635434433817864, 'train/lm_loss': 0.0001271797111257911, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.22497393190860748, 'train/uncertainty_loss': 0.0019095348194241525, 'train/video_loss': 0.2298349142074585, 'train/total_loss': 0.22996209561824799} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3796, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4346, 'grad_norm': 11.575284957885742, 'learning_rate': 1.4528019897077142e-05}[Rank 0] Trainer log: {'loss': 0.4346, 'grad_norm': 11.575284957885742, 'learning_rate': 1.4528019897077142e-05}[Rank 3] Trainer log: {'loss': 0.4346, 'grad_norm': 11.575284957885742, 'learning_rate': 1.4528019897077142e-05} - - -{'loss': 0.4346, 'grad_norm': 11.575284957885742, 'learning_rate': 1.4528019897077142e-05, 'epoch': 0.38} -[Rank 2] Trainer log: {'loss': 0.4346, 'grad_norm': 11.575284957885742, 'learning_rate': 1.4528019897077142e-05} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003262773621827364, 'train/lm_loss': 0.00011347888503223658, 'train/info_loss': 4.142351099289954e-05, 'train/ref_loss': 0.21770350635051727, 'train/uncertainty_loss': -7.634145440533758e-05, 'train/video_loss': 0.2202788144350052, 'train/total_loss': 0.22039228677749634} -tensor(0.1237, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20170469284057618, 'train/info_loss': 0.206663578748703, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012530622771009803, 'train/video_loss': 0.20653827488422394, 'train/total_loss': 0.40824297070503235} -tensor(0.0310, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2061, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3996, 'grad_norm': 5.784742832183838, 'learning_rate': 1.4518506018986076e-05}[Rank 2] Trainer log: {'loss': 0.3996, 'grad_norm': 5.784742832183838, 'learning_rate': 1.4518506018986076e-05} -[Rank 1] Trainer log: {'loss': 0.3996, 'grad_norm': 5.784742832183838, 'learning_rate': 1.4518506018986076e-05} - -[Rank 0] Trainer log: {'loss': 0.3996, 'grad_norm': 5.784742832183838, 'learning_rate': 1.4518506018986076e-05} -{'loss': 0.3996, 'grad_norm': 5.784742832183838, 'learning_rate': 1.4518506018986076e-05, 'epoch': 0.38} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0690, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003895872971042991, 'train/lm_loss': 7.70182115957141e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.26596418023109436, 'train/uncertainty_loss': 0.0068994112312793735, 'train/video_loss': 0.2760159969329834, 'train/total_loss': 0.2760930061340332} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22219712734222413, 'train/info_loss': 0.16284534335136414, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010561130475252868, 'train/video_loss': 0.16273973882198334, 'train/total_loss': 0.3849368691444397} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0168, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3628, 'grad_norm': 7.342229843139648, 'learning_rate': 1.450898699899093e-05}[Rank 2] Trainer log: {'loss': 0.3628, 'grad_norm': 7.342229843139648, 'learning_rate': 1.450898699899093e-05}[Rank 0] Trainer log: {'loss': 0.3628, 'grad_norm': 7.342229843139648, 'learning_rate': 1.450898699899093e-05} - - -[Rank 3] Trainer log: {'loss': 0.3628, 'grad_norm': 7.342229843139648, 'learning_rate': 1.450898699899093e-05}{'loss': 0.3628, 'grad_norm': 7.342229843139648, 'learning_rate': 1.450898699899093e-05, 'epoch': 0.38} - -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3519636392593384, 'train/info_loss': 0.1816101223230362, 'train/ref_loss': None, 'train/uncertainty_loss': -9.930580854415894e-05, 'train/video_loss': 0.1815108209848404, 'train/total_loss': 0.5334744453430176} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2950, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31602187156677247, 'train/info_loss': 0.19124522805213928, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013600611127913, 'train/video_loss': 0.19110922515392303, 'train/total_loss': 0.5071310997009277} -tensor(0.8393, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5416, 'grad_norm': 10.657554626464844, 'learning_rate': 1.4499462847924016e-05}[Rank 3] Trainer log: {'loss': 0.5416, 'grad_norm': 10.657554626464844, 'learning_rate': 1.4499462847924016e-05}[Rank 2] Trainer log: {'loss': 0.5416, 'grad_norm': 10.657554626464844, 'learning_rate': 1.4499462847924016e-05} - -[Rank 0] Trainer log: {'loss': 0.5416, 'grad_norm': 10.657554626464844, 'learning_rate': 1.4499462847924016e-05} - -{'loss': 0.5416, 'grad_norm': 10.657554626464844, 'learning_rate': 1.4499462847924016e-05, 'epoch': 0.38} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22788643836975098, 'train/info_loss': 0.162540003657341, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010544337565079332, 'train/video_loss': 0.16243456304073334, 'train/total_loss': 0.3903210163116455} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.2074, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0054, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024722397793084386, 'train/lm_loss': 6.922477623447776e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.2186952531337738, 'train/uncertainty_loss': 0.0005354431457817555, 'train/video_loss': 0.22124448418617249, 'train/total_loss': 0.2213137149810791} -[Rank 1] Trainer log: {'loss': 0.3802, 'grad_norm': 4.742737293243408, 'learning_rate': 1.4489933576623491e-05}[Rank 3] Trainer log: {'loss': 0.3802, 'grad_norm': 4.742737293243408, 'learning_rate': 1.4489933576623491e-05}[Rank 2] Trainer log: {'loss': 0.3802, 'grad_norm': 4.742737293243408, 'learning_rate': 1.4489933576623491e-05} - - -[Rank 0] Trainer log: {'loss': 0.3802, 'grad_norm': 4.742737293243408, 'learning_rate': 1.4489933576623491e-05} -{'loss': 0.3802, 'grad_norm': 4.742737293243408, 'learning_rate': 1.4489933576623491e-05, 'epoch': 0.38} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2539290189743042, 'train/info_loss': 0.18807797133922577, 'train/ref_loss': None, 'train/uncertainty_loss': -9.141711052507162e-05, 'train/video_loss': 0.1879865527153015, 'train/total_loss': 0.4419155716896057} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1154, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2214754819869995, 'train/info_loss': 0.13418658077716827, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001147128874436021, 'train/video_loss': 0.13407187163829803, 'train/total_loss': 0.35554736852645874} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3481, 'grad_norm': 5.831614017486572, 'learning_rate': 1.4480399195933334e-05}[Rank 1] Trainer log: {'loss': 0.3481, 'grad_norm': 5.831614017486572, 'learning_rate': 1.4480399195933334e-05}[Rank 2] Trainer log: {'loss': 0.3481, 'grad_norm': 5.831614017486572, 'learning_rate': 1.4480399195933334e-05} - - -[Rank 0] Trainer log: {'loss': 0.3481, 'grad_norm': 5.831614017486572, 'learning_rate': 1.4480399195933334e-05} -{'loss': 0.3481, 'grad_norm': 5.831614017486572, 'learning_rate': 1.4480399195933334e-05, 'epoch': 0.38} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25505771636962893, 'train/info_loss': 0.2505171298980713, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013726112665608526, 'train/video_loss': 0.25037986040115356, 'train/total_loss': 0.5054376125335693} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0660, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1526, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020433196332305672, 'train/lm_loss': 0.00013089664280414582, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.31294214725494385, 'train/uncertainty_loss': 0.015264596045017243, 'train/video_loss': 0.3298821747303009, 'train/total_loss': 0.33001306653022766} -[Rank 1] Trainer log: {'loss': 0.3886, 'grad_norm': 10.908761024475098, 'learning_rate': 1.4470859716703349e-05}[Rank 2] Trainer log: {'loss': 0.3886, 'grad_norm': 10.908761024475098, 'learning_rate': 1.4470859716703349e-05}[Rank 3] Trainer log: {'loss': 0.3886, 'grad_norm': 10.908761024475098, 'learning_rate': 1.4470859716703349e-05} - - -[Rank 0] Trainer log: {'loss': 0.3886, 'grad_norm': 10.908761024475098, 'learning_rate': 1.4470859716703349e-05} -{'loss': 0.3886, 'grad_norm': 10.908761024475098, 'learning_rate': 1.4470859716703349e-05, 'epoch': 0.38} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27524991035461427, 'train/info_loss': 0.20790661871433258, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011688565136864782, 'train/video_loss': 0.20778973400592804, 'train/total_loss': 0.4830396771430969} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.7970, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1762, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003603392979130149, 'train/lm_loss': 9.315228671766818e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.33075207471847534, 'train/uncertainty_loss': 0.017619486153125762, 'train/video_loss': 0.35129350423812866, 'train/total_loss': 0.3513866662979126} -tensor(0.0837, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3682, 'grad_norm': 6.185774803161621, 'learning_rate': 1.4461315149789128e-05}[Rank 3] Trainer log: {'loss': 0.3682, 'grad_norm': 6.185774803161621, 'learning_rate': 1.4461315149789128e-05}[Rank 2] Trainer log: {'loss': 0.3682, 'grad_norm': 6.185774803161621, 'learning_rate': 1.4461315149789128e-05} -[Rank 0] Trainer log: {'loss': 0.3682, 'grad_norm': 6.185774803161621, 'learning_rate': 1.4461315149789128e-05} - - -{'loss': 0.3682, 'grad_norm': 6.185774803161621, 'learning_rate': 1.4461315149789128e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002876846352592111, 'train/lm_loss': 6.79854245390743e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.12039010226726532, 'train/uncertainty_loss': -6.951533141545952e-05, 'train/video_loss': 0.12265776842832565, 'train/total_loss': 0.12272575497627258} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3764258146286011, 'train/info_loss': 0.20296898484230042, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010589739540591837, 'train/video_loss': 0.20286308228969574, 'train/total_loss': 0.579288899898529} -[Rank 2] Trainer log: {'loss': 0.3888, 'grad_norm': 5.923813819885254, 'learning_rate': 1.4451765506052064e-05}[Rank 0] Trainer log: {'loss': 0.3888, 'grad_norm': 5.923813819885254, 'learning_rate': 1.4451765506052064e-05}[Rank 1] Trainer log: {'loss': 0.3888, 'grad_norm': 5.923813819885254, 'learning_rate': 1.4451765506052064e-05} - -[Rank 3] Trainer log: {'loss': 0.3888, 'grad_norm': 5.923813819885254, 'learning_rate': 1.4451765506052064e-05} - -{'loss': 0.3888, 'grad_norm': 5.923813819885254, 'learning_rate': 1.4451765506052064e-05, 'epoch': 0.39} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04949542582035065, 'train/info_loss': 0.1346898376941681, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010036054300144315, 'train/video_loss': 0.13458947837352753, 'train/total_loss': 0.1840849071741104} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0741, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11959418058395387, 'train/info_loss': 0.16686789691448212, 'train/ref_loss': None, 'train/uncertainty_loss': -9.828442707657815e-05, 'train/video_loss': 0.1667696088552475, 'train/total_loss': 0.28636378049850464} -[Rank 1] Trainer log: {'loss': 0.396, 'grad_norm': 4.411120414733887, 'learning_rate': 1.4442210796359316e-05} -[Rank 0] Trainer log: {'loss': 0.396, 'grad_norm': 4.411120414733887, 'learning_rate': 1.4442210796359316e-05}[Rank 3] Trainer log: {'loss': 0.396, 'grad_norm': 4.411120414733887, 'learning_rate': 1.4442210796359316e-05} -[Rank 2] Trainer log: {'loss': 0.396, 'grad_norm': 4.411120414733887, 'learning_rate': 1.4442210796359316e-05} - -{'loss': 0.396, 'grad_norm': 4.411120414733887, 'learning_rate': 1.4442210796359316e-05, 'epoch': 0.39} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3194318294525147, 'train/info_loss': 0.27355867624282837, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001226032734848559, 'train/video_loss': 0.2734360694885254, 'train/total_loss': 0.592867910861969} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(0.1558, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.0002873212564736605, 'train/lm_loss': 8.871971513144672e-05, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.08270609378814697, 'train/uncertainty_loss': -6.858179694972933e-05, 'train/video_loss': 0.08497720956802368, 'train/total_loss': 0.08506593108177185} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 8.151347160339355, 'learning_rate': 1.4432651031583818e-05}[Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 8.151347160339355, 'learning_rate': 1.4432651031583818e-05}[Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 8.151347160339355, 'learning_rate': 1.4432651031583818e-05} - -[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 8.151347160339355, 'learning_rate': 1.4432651031583818e-05} - -{'loss': 0.3904, 'grad_norm': 8.151347160339355, 'learning_rate': 1.4432651031583818e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21801364421844482, 'train/info_loss': 0.10116565227508545, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010744897881522776, 'train/video_loss': 0.10105820000171661, 'train/total_loss': 0.31907182931900024} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3057993173599243, 'train/info_loss': 0.14046138525009155, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010212495690211655, 'train/video_loss': 0.14035926759243011, 'train/total_loss': 0.44615858793258667} -tensor(0.4566, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4553, 'grad_norm': 12.64486026763916, 'learning_rate': 1.4423086222604257e-05}[Rank 3] Trainer log: {'loss': 0.4553, 'grad_norm': 12.64486026763916, 'learning_rate': 1.4423086222604257e-05} -[Rank 2] Trainer log: {'loss': 0.4553, 'grad_norm': 12.64486026763916, 'learning_rate': 1.4423086222604257e-05} -[Rank 1] Trainer log: {'loss': 0.4553, 'grad_norm': 12.64486026763916, 'learning_rate': 1.4423086222604257e-05} - -{'loss': 0.4553, 'grad_norm': 12.64486026763916, 'learning_rate': 1.4423086222604257e-05, 'epoch': 0.39} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35741887092590335, 'train/info_loss': 0.18524709343910217, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001131392433308065, 'train/video_loss': 0.1851339489221573, 'train/total_loss': 0.5425528287887573} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1359, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2718914747238159, 'train/info_loss': 0.18658120930194855, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001210486749187112, 'train/video_loss': 0.18646016716957092, 'train/total_loss': 0.45835164189338684} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2383, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4895, 'grad_norm': 8.681769371032715, 'learning_rate': 1.4413516380305046e-05}[Rank 1] Trainer log: {'loss': 0.4895, 'grad_norm': 8.681769371032715, 'learning_rate': 1.4413516380305046e-05}[Rank 2] Trainer log: {'loss': 0.4895, 'grad_norm': 8.681769371032715, 'learning_rate': 1.4413516380305046e-05} - - -[Rank 0] Trainer log: {'loss': 0.4895, 'grad_norm': 8.681769371032715, 'learning_rate': 1.4413516380305046e-05} -{'loss': 0.4895, 'grad_norm': 8.681769371032715, 'learning_rate': 1.4413516380305046e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18496252298355104, 'train/info_loss': 0.09756822139024734, 'train/ref_loss': None, 'train/uncertainty_loss': -9.207715629599988e-05, 'train/video_loss': 0.09747614711523056, 'train/total_loss': 0.2824386656284332} -tensor(0.0843, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19980188608169558, 'train/info_loss': 0.22860953211784363, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012062016176059843, 'train/video_loss': 0.22848890721797943, 'train/total_loss': 0.42829078435897827} -tensor(0.4075, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3371, 'grad_norm': 14.219959259033203, 'learning_rate': 1.4403941515576344e-05}[Rank 3] Trainer log: {'loss': 0.3371, 'grad_norm': 14.219959259033203, 'learning_rate': 1.4403941515576344e-05} - -[Rank 0] Trainer log: {'loss': 0.3371, 'grad_norm': 14.219959259033203, 'learning_rate': 1.4403941515576344e-05}[Rank 1] Trainer log: {'loss': 0.3371, 'grad_norm': 14.219959259033203, 'learning_rate': 1.4403941515576344e-05} - -{'loss': 0.3371, 'grad_norm': 14.219959259033203, 'learning_rate': 1.4403941515576344e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14723637104034423, 'train/info_loss': 0.1806749701499939, 'train/ref_loss': None, 'train/uncertainty_loss': -9.61200159508735e-05, 'train/video_loss': 0.18057884275913239, 'train/total_loss': 0.3278152346611023} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.8878195762634278, 'train/info_loss': 0.16411638259887695, 'train/ref_loss': None, 'train/uncertainty_loss': -9.825442684814335e-05, 'train/video_loss': 0.16401812434196472, 'train/total_loss': 1.051837682723999} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0476, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4504, 'grad_norm': 7.432770729064941, 'learning_rate': 1.4394361639314015e-05}[Rank 2] Trainer log: {'loss': 0.4504, 'grad_norm': 7.432770729064941, 'learning_rate': 1.4394361639314015e-05} -[Rank 3] Trainer log: {'loss': 0.4504, 'grad_norm': 7.432770729064941, 'learning_rate': 1.4394361639314015e-05} - -[Rank 1] Trainer log: {'loss': 0.4504, 'grad_norm': 7.432770729064941, 'learning_rate': 1.4394361639314015e-05} -{'loss': 0.4504, 'grad_norm': 7.432770729064941, 'learning_rate': 1.4394361639314015e-05, 'epoch': 0.39} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1036, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002347078174352646, 'train/lm_loss': 8.714684518054129e-05, 'train/info_loss': 3.9814316551201046e-05, 'train/ref_loss': 0.2688283622264862, 'train/uncertainty_loss': 0.01035972833633423, 'train/video_loss': 0.28110557794570923, 'train/total_loss': 0.28119271993637085} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2531026840209961, 'train/info_loss': 0.21614326536655426, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000133492611348629, 'train/video_loss': 0.21600976586341858, 'train/total_loss': 0.46911245584487915} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1464, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3652, 'grad_norm': 2.800964832305908, 'learning_rate': 1.4384776762419625e-05}[Rank 1] Trainer log: {'loss': 0.3652, 'grad_norm': 2.800964832305908, 'learning_rate': 1.4384776762419625e-05} -[Rank 2] Trainer log: {'loss': 0.3652, 'grad_norm': 2.800964832305908, 'learning_rate': 1.4384776762419625e-05} - -[Rank 3] Trainer log: {'loss': 0.3652, 'grad_norm': 2.800964832305908, 'learning_rate': 1.4384776762419625e-05} -{'loss': 0.3652, 'grad_norm': 2.800964832305908, 'learning_rate': 1.4384776762419625e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0472, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003056176705285907, 'train/lm_loss': 0.00011993625666946174, 'train/info_loss': 4.619146420736797e-05, 'train/ref_loss': 0.1509859263896942, 'train/uncertainty_loss': -7.128905854187906e-05, 'train/video_loss': 0.15340577065944672, 'train/total_loss': 0.153525710105896} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25319581031799315, 'train/info_loss': 0.16283059120178223, 'train/ref_loss': None, 'train/uncertainty_loss': -9.987055091187358e-05, 'train/video_loss': 0.16273072361946106, 'train/total_loss': 0.4159265458583832} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3534, 'grad_norm': 2.8409042358398438, 'learning_rate': 1.4375186895800436e-05}[Rank 2] Trainer log: {'loss': 0.3534, 'grad_norm': 2.8409042358398438, 'learning_rate': 1.4375186895800436e-05} -[Rank 1] Trainer log: {'loss': 0.3534, 'grad_norm': 2.8409042358398438, 'learning_rate': 1.4375186895800436e-05} - -[Rank 3] Trainer log: {'loss': 0.3534, 'grad_norm': 2.8409042358398438, 'learning_rate': 1.4375186895800436e-05} -{'loss': 0.3534, 'grad_norm': 2.8409042358398438, 'learning_rate': 1.4375186895800436e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021560147870332004, 'train/lm_loss': 0.00016448884271085264, 'train/info_loss': 4.410549081512727e-05, 'train/ref_loss': 0.14672566950321198, 'train/uncertainty_loss': -7.046200335025787e-05, 'train/video_loss': 0.14842411875724792, 'train/total_loss': 0.1485886126756668} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002603395842015743, 'train/lm_loss': 0.00031101349741220475, 'train/info_loss': 5.113816951052286e-05, 'train/ref_loss': 0.09259446710348129, 'train/uncertainty_loss': -7.233808864839376e-05, 'train/video_loss': 0.09465598315000534, 'train/total_loss': 0.09496700018644333} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3754, 'grad_norm': 2.0537493228912354, 'learning_rate': 1.4365592050369388e-05}[Rank 3] Trainer log: {'loss': 0.3754, 'grad_norm': 2.0537493228912354, 'learning_rate': 1.4365592050369388e-05} - -[Rank 0] Trainer log: {'loss': 0.3754, 'grad_norm': 2.0537493228912354, 'learning_rate': 1.4365592050369388e-05} -{'loss': 0.3754, 'grad_norm': 2.0537493228912354, 'learning_rate': 1.4365592050369388e-05, 'epoch': 0.39} -[Rank 1] Trainer log: {'loss': 0.3754, 'grad_norm': 2.0537493228912354, 'learning_rate': 1.4365592050369388e-05} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2936, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019729123450815678, 'train/lm_loss': 6.805692100897431e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.41022980213165283, 'train/uncertainty_loss': 0.029360684752464297, 'train/video_loss': 0.4412039816379547, 'train/total_loss': 0.4412720501422882} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1660, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2430351495742798, 'train/info_loss': 0.21064572036266327, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001334988744929433, 'train/video_loss': 0.2105122208595276, 'train/total_loss': 0.4535473585128784} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1419, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3469, 'grad_norm': 8.600909233093262, 'learning_rate': 1.4355992237045078e-05} -[Rank 0] Trainer log: {'loss': 0.3469, 'grad_norm': 8.600909233093262, 'learning_rate': 1.4355992237045078e-05}[Rank 1] Trainer log: {'loss': 0.3469, 'grad_norm': 8.600909233093262, 'learning_rate': 1.4355992237045078e-05} - -[Rank 3] Trainer log: {'loss': 0.3469, 'grad_norm': 8.600909233093262, 'learning_rate': 1.4355992237045078e-05} -{'loss': 0.3469, 'grad_norm': 8.600909233093262, 'learning_rate': 1.4355992237045078e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4350426197052002, 'train/info_loss': 0.18088804185390472, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011030402965843678, 'train/video_loss': 0.18077774345874786, 'train/total_loss': 0.6158203482627869} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1901, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0051, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3058550596237183, 'train/info_loss': 0.21102721989154816, 'train/ref_loss': None, 'train/uncertainty_loss': -9.75360511802137e-05, 'train/video_loss': 0.21092967689037323, 'train/total_loss': 0.5167847275733948} -tensor(0.5318, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4411, 'grad_norm': 9.000370025634766, 'learning_rate': 1.4346387466751768e-05} -[Rank 1] Trainer log: {'loss': 0.4411, 'grad_norm': 9.000370025634766, 'learning_rate': 1.4346387466751768e-05} -[Rank 3] Trainer log: {'loss': 0.4411, 'grad_norm': 9.000370025634766, 'learning_rate': 1.4346387466751768e-05} -[Rank 0] Trainer log: {'loss': 0.4411, 'grad_norm': 9.000370025634766, 'learning_rate': 1.4346387466751768e-05} -{'loss': 0.4411, 'grad_norm': 9.000370025634766, 'learning_rate': 1.4346387466751768e-05, 'epoch': 0.39} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3479896306991577, 'train/info_loss': 0.16109688580036163, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012188347754999996, 'train/video_loss': 0.16097500920295715, 'train/total_loss': 0.5089646577835083} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3976, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2052576780319214, 'train/info_loss': 0.18546923995018005, 'train/ref_loss': None, 'train/uncertainty_loss': -8.363122469745576e-05, 'train/video_loss': 0.18538561463356018, 'train/total_loss': 0.39064329862594604} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3571, 'grad_norm': 8.494061470031738, 'learning_rate': 1.4336777750419353e-05}[Rank 2] Trainer log: {'loss': 0.3571, 'grad_norm': 8.494061470031738, 'learning_rate': 1.4336777750419353e-05}[Rank 1] Trainer log: {'loss': 0.3571, 'grad_norm': 8.494061470031738, 'learning_rate': 1.4336777750419353e-05} - - -[Rank 3] Trainer log: {'loss': 0.3571, 'grad_norm': 8.494061470031738, 'learning_rate': 1.4336777750419353e-05} -{'loss': 0.3571, 'grad_norm': 8.494061470031738, 'learning_rate': 1.4336777750419353e-05, 'epoch': 0.39} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.103838050365448, 'train/info_loss': 0.2347031682729721, 'train/ref_loss': None, 'train/uncertainty_loss': -9.627972613088787e-05, 'train/video_loss': 0.23460689187049866, 'train/total_loss': 0.33844494819641113} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.5618, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1820, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14816957712173462, 'train/info_loss': 0.1335594356060028, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010268774349242449, 'train/video_loss': 0.133456751704216, 'train/total_loss': 0.2816263437271118} -tensor(0.2461, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3901, 'grad_norm': 9.970301628112793, 'learning_rate': 1.4327163098983362e-05} -[Rank 1] Trainer log: {'loss': 0.3901, 'grad_norm': 9.970301628112793, 'learning_rate': 1.4327163098983362e-05}[Rank 0] Trainer log: {'loss': 0.3901, 'grad_norm': 9.970301628112793, 'learning_rate': 1.4327163098983362e-05} - -{'loss': 0.3901, 'grad_norm': 9.970301628112793, 'learning_rate': 1.4327163098983362e-05, 'epoch': 0.39}[Rank 3] Trainer log: {'loss': 0.3901, 'grad_norm': 9.970301628112793, 'learning_rate': 1.4327163098983362e-05} - -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0183, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029929482843726873, 'train/lm_loss': 0.00010037287138402462, 'train/info_loss': 4.017191531602293e-05, 'train/ref_loss': 0.2150421440601349, 'train/uncertainty_loss': 0.0018258344382047654, 'train/video_loss': 0.21930252015590668, 'train/total_loss': 0.21940289437770844} -tensor(0.1154, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3099, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2428, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022926374804228545, 'train/lm_loss': 0.00010711662471294404, 'train/info_loss': 3.892031963914633e-05, 'train/ref_loss': 0.378101646900177, 'train/uncertainty_loss': 0.024284039437770844, 'train/video_loss': 0.40425872802734375, 'train/total_loss': 0.4043658375740051} -tensor(0.1018, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3679, 'grad_norm': 7.863321781158447, 'learning_rate': 1.4317543523384928e-05}[Rank 1] Trainer log: {'loss': 0.3679, 'grad_norm': 7.863321781158447, 'learning_rate': 1.4317543523384928e-05}[Rank 2] Trainer log: {'loss': 0.3679, 'grad_norm': 7.863321781158447, 'learning_rate': 1.4317543523384928e-05} - - -[Rank 0] Trainer log: {'loss': 0.3679, 'grad_norm': 7.863321781158447, 'learning_rate': 1.4317543523384928e-05} -{'loss': 0.3679, 'grad_norm': 7.863321781158447, 'learning_rate': 1.4317543523384928e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.47691774368286133, 'train/info_loss': 0.22138981521129608, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001050568069331348, 'train/video_loss': 0.22128476202487946, 'train/total_loss': 0.6982024908065796} -tensor(0.1324, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1475, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0733, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002663233783096075, 'train/lm_loss': 9.51540598180145e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.12075843662023544, 'train/uncertainty_loss': -6.883103051222861e-05, 'train/video_loss': 0.1228594183921814, 'train/total_loss': 0.12295456975698471} -tensor(0.0141, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3667, 'grad_norm': 3.2984635829925537, 'learning_rate': 1.430791903457081e-05}[Rank 3] Trainer log: {'loss': 0.3667, 'grad_norm': 3.2984635829925537, 'learning_rate': 1.430791903457081e-05}[Rank 1] Trainer log: {'loss': 0.3667, 'grad_norm': 3.2984635829925537, 'learning_rate': 1.430791903457081e-05} - - -[Rank 0] Trainer log: {'loss': 0.3667, 'grad_norm': 3.2984635829925537, 'learning_rate': 1.430791903457081e-05} -{'loss': 0.3667, 'grad_norm': 3.2984635829925537, 'learning_rate': 1.430791903457081e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26583735942840575, 'train/info_loss': 0.21293942630290985, 'train/ref_loss': None, 'train/uncertainty_loss': -9.001527796499432e-05, 'train/video_loss': 0.21284940838813782, 'train/total_loss': 0.47868677973747253} -tensor(0.1439, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11686432361602783, 'train/info_loss': 0.22441913187503815, 'train/ref_loss': None, 'train/uncertainty_loss': -8.304078946821392e-05, 'train/video_loss': 0.22433608770370483, 'train/total_loss': 0.34120041131973267} -tensor(0.2490, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3849, 'grad_norm': 4.931875705718994, 'learning_rate': 1.4298289643493335e-05} -[Rank 0] Trainer log: {'loss': 0.3849, 'grad_norm': 4.931875705718994, 'learning_rate': 1.4298289643493335e-05}[Rank 1] Trainer log: {'loss': 0.3849, 'grad_norm': 4.931875705718994, 'learning_rate': 1.4298289643493335e-05}[Rank 2] Trainer log: {'loss': 0.3849, 'grad_norm': 4.931875705718994, 'learning_rate': 1.4298289643493335e-05} - - -{'loss': 0.3849, 'grad_norm': 4.931875705718994, 'learning_rate': 1.4298289643493335e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00034970103297382593, 'train/lm_loss': 0.00011257340665906669, 'train/info_loss': 4.2436706280568615e-05, 'train/ref_loss': 0.1683778315782547, 'train/uncertainty_loss': -7.030750275589525e-05, 'train/video_loss': 0.17114757001399994, 'train/total_loss': 0.17126014828681946} -tensor(0.0076, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2395, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0888, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002780026523396373, 'train/lm_loss': 6.743724807165564e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.27662456035614014, 'train/uncertainty_loss': 0.008879037201404571, 'train/video_loss': 0.2877627909183502, 'train/total_loss': 0.2878302335739136} -[Rank 3] Trainer log: {'loss': 0.4222, 'grad_norm': 10.050193786621094, 'learning_rate': 1.4288655361110421e-05}[Rank 1] Trainer log: {'loss': 0.4222, 'grad_norm': 10.050193786621094, 'learning_rate': 1.4288655361110421e-05}[Rank 2] Trainer log: {'loss': 0.4222, 'grad_norm': 10.050193786621094, 'learning_rate': 1.4288655361110421e-05} - - -[Rank 0] Trainer log: {'loss': 0.4222, 'grad_norm': 10.050193786621094, 'learning_rate': 1.4288655361110421e-05} -{'loss': 0.4222, 'grad_norm': 10.050193786621094, 'learning_rate': 1.4288655361110421e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16320935487747193, 'train/info_loss': 0.13929621875286102, 'train/ref_loss': None, 'train/uncertainty_loss': -9.491844102740289e-05, 'train/video_loss': 0.1392012983560562, 'train/total_loss': 0.30241066217422485} -tensor(0.0166, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.6060578346252442, 'train/info_loss': 0.23729948699474335, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013766560005024076, 'train/video_loss': 0.2371618151664734, 'train/total_loss': 0.8432196378707886} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0656, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4177, 'grad_norm': 4.428340435028076, 'learning_rate': 1.4279016198385547e-05}[Rank 3] Trainer log: {'loss': 0.4177, 'grad_norm': 4.428340435028076, 'learning_rate': 1.4279016198385547e-05}[Rank 2] Trainer log: {'loss': 0.4177, 'grad_norm': 4.428340435028076, 'learning_rate': 1.4279016198385547e-05} -[Rank 0] Trainer log: {'loss': 0.4177, 'grad_norm': 4.428340435028076, 'learning_rate': 1.4279016198385547e-05} - - -{'loss': 0.4177, 'grad_norm': 4.428340435028076, 'learning_rate': 1.4279016198385547e-05, 'epoch': 0.39} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2267728090286255, 'train/info_loss': 0.1666504591703415, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011609707726165653, 'train/video_loss': 0.16653436422348022, 'train/total_loss': 0.3933071792125702} -tensor(0.1353, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2683493852615357, 'train/info_loss': 0.2360333949327469, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012138922465965152, 'train/video_loss': 0.2359120100736618, 'train/total_loss': 0.5042613744735718} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4158, 'grad_norm': 1.940561294555664, 'learning_rate': 1.426937216628775e-05}[Rank 2] Trainer log: {'loss': 0.4158, 'grad_norm': 1.940561294555664, 'learning_rate': 1.426937216628775e-05}[Rank 3] Trainer log: {'loss': 0.4158, 'grad_norm': 1.940561294555664, 'learning_rate': 1.426937216628775e-05} - - -[Rank 0] Trainer log: {'loss': 0.4158, 'grad_norm': 1.940561294555664, 'learning_rate': 1.426937216628775e-05} -{'loss': 0.4158, 'grad_norm': 1.940561294555664, 'learning_rate': 1.426937216628775e-05, 'epoch': 0.39} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1867, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021259181667119265, 'train/lm_loss': 0.0001142652239650488, 'train/info_loss': 4.207910751574673e-05, 'train/ref_loss': 0.33477383852005005, 'train/uncertainty_loss': 0.018666137754917145, 'train/video_loss': 0.35518279671669006, 'train/total_loss': 0.3552970588207245} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000306038442067802, 'train/lm_loss': 0.00012970531824976205, 'train/info_loss': 4.3092302803415805e-05, 'train/ref_loss': 0.1520814597606659, 'train/uncertainty_loss': -6.877082050777971e-05, 'train/video_loss': 0.15450409054756165, 'train/total_loss': 0.1546337902545929} -[Rank 1] Trainer log: {'loss': 0.295, 'grad_norm': 12.695387840270996, 'learning_rate': 1.4259723275791606e-05} -[Rank 3] Trainer log: {'loss': 0.295, 'grad_norm': 12.695387840270996, 'learning_rate': 1.4259723275791606e-05} -[Rank 2] Trainer log: {'loss': 0.295, 'grad_norm': 12.695387840270996, 'learning_rate': 1.4259723275791606e-05} -[Rank 0] Trainer log: {'loss': 0.295, 'grad_norm': 12.695387840270996, 'learning_rate': 1.4259723275791606e-05} -{'loss': 0.295, 'grad_norm': 12.695387840270996, 'learning_rate': 1.4259723275791606e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1337, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1250, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004775537643581629, 'train/lm_loss': 5.921440897509456e-05, 'train/info_loss': 3.1529863917967305e-05, 'train/ref_loss': 0.1640169471502304, 'train/uncertainty_loss': -7.344405166804792e-05, 'train/video_loss': 0.16779546439647675, 'train/total_loss': 0.1678546816110611} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2505443811416626, 'train/info_loss': 0.2896290123462677, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011868766741827131, 'train/video_loss': 0.2895103394985199, 'train/total_loss': 0.5400547385215759} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0769, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2999, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3614, 'grad_norm': 18.148000717163086, 'learning_rate': 1.425006953787722e-05}[Rank 2] Trainer log: {'loss': 0.3614, 'grad_norm': 18.148000717163086, 'learning_rate': 1.425006953787722e-05}[Rank 1] Trainer log: {'loss': 0.3614, 'grad_norm': 18.148000717163086, 'learning_rate': 1.425006953787722e-05} - - -[Rank 0] Trainer log: {'loss': 0.3614, 'grad_norm': 18.148000717163086, 'learning_rate': 1.425006953787722e-05} -{'loss': 0.3614, 'grad_norm': 18.148000717163086, 'learning_rate': 1.425006953787722e-05, 'epoch': 0.39} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3098063945770264, 'train/info_loss': 0.19421640038490295, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011786677641794086, 'train/video_loss': 0.19409853219985962, 'train/total_loss': 0.5039049386978149} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5250, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24482352733612062, 'train/info_loss': 0.19222497940063477, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010585985146462918, 'train/video_loss': 0.19211912155151367, 'train/total_loss': 0.4369426369667053} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1263, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5095, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4171, 'grad_norm': 4.868343830108643, 'learning_rate': 1.424041096353021e-05} -[Rank 3] Trainer log: {'loss': 0.4171, 'grad_norm': 4.868343830108643, 'learning_rate': 1.424041096353021e-05}[Rank 0] Trainer log: {'loss': 0.4171, 'grad_norm': 4.868343830108643, 'learning_rate': 1.424041096353021e-05} -[Rank 2] Trainer log: {'loss': 0.4171, 'grad_norm': 4.868343830108643, 'learning_rate': 1.424041096353021e-05} - -{'loss': 0.4171, 'grad_norm': 4.868343830108643, 'learning_rate': 1.424041096353021e-05, 'epoch': 0.39} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00040494338609278204, 'train/lm_loss': 8.73851589858532e-05, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.18619582056999207, 'train/uncertainty_loss': -7.920535863377155e-05, 'train/video_loss': 0.18939389288425446, 'train/total_loss': 0.18948127329349518} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2725529432296753, 'train/info_loss': 0.300446093082428, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013115921756252648, 'train/video_loss': 0.30031493306159973, 'train/total_loss': 0.5728678703308105} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1134, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3638, 'grad_norm': 3.2330596446990967, 'learning_rate': 1.4230747563741707e-05}[Rank 3] Trainer log: {'loss': 0.3638, 'grad_norm': 3.2330596446990967, 'learning_rate': 1.4230747563741707e-05}[Rank 0] Trainer log: {'loss': 0.3638, 'grad_norm': 3.2330596446990967, 'learning_rate': 1.4230747563741707e-05} - -[Rank 2] Trainer log: {'loss': 0.3638, 'grad_norm': 3.2330596446990967, 'learning_rate': 1.4230747563741707e-05} - -{'loss': 0.3638, 'grad_norm': 3.2330596446990967, 'learning_rate': 1.4230747563741707e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19281537532806398, 'train/info_loss': 0.11239191144704819, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010025483788922429, 'train/video_loss': 0.11229165643453598, 'train/total_loss': 0.3051070272922516} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002855689730495215, 'train/lm_loss': 0.00010080181527882814, 'train/info_loss': 4.172151238890365e-05, 'train/ref_loss': 0.2133607119321823, 'train/uncertainty_loss': -7.386664510704577e-05, 'train/video_loss': 0.21561312675476074, 'train/total_loss': 0.21571393311023712} -tensor(0.2025, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2654, 'grad_norm': 4.589602470397949, 'learning_rate': 1.422107934950832e-05}[Rank 2] Trainer log: {'loss': 0.2654, 'grad_norm': 4.589602470397949, 'learning_rate': 1.422107934950832e-05}[Rank 3] Trainer log: {'loss': 0.2654, 'grad_norm': 4.589602470397949, 'learning_rate': 1.422107934950832e-05} - - -[Rank 0] Trainer log: {'loss': 0.2654, 'grad_norm': 4.589602470397949, 'learning_rate': 1.422107934950832e-05} -{'loss': 0.2654, 'grad_norm': 4.589602470397949, 'learning_rate': 1.422107934950832e-05, 'epoch': 0.39} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0107, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2079, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002194827888160944, 'train/lm_loss': 0.00010749788489192725, 'train/info_loss': 4.207910751574673e-05, 'train/ref_loss': 0.3473450541496277, 'train/uncertainty_loss': 0.02078976631164551, 'train/video_loss': 0.36993277072906494, 'train/total_loss': 0.37004026770591736} -tensor(0.0014, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3162313222885132, 'train/info_loss': 0.17612260580062866, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011937841773033143, 'train/video_loss': 0.17600323259830475, 'train/total_loss': 0.49223458766937256} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3443, 'grad_norm': 5.603410243988037, 'learning_rate': 1.4211406331832147e-05}[Rank 1] Trainer log: {'loss': 0.3443, 'grad_norm': 5.603410243988037, 'learning_rate': 1.4211406331832147e-05}[Rank 0] Trainer log: {'loss': 0.3443, 'grad_norm': 5.603410243988037, 'learning_rate': 1.4211406331832147e-05} - - -[Rank 2] Trainer log: {'loss': 0.3443, 'grad_norm': 5.603410243988037, 'learning_rate': 1.4211406331832147e-05} -{'loss': 0.3443, 'grad_norm': 5.603410243988037, 'learning_rate': 1.4211406331832147e-05, 'epoch': 0.39} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1263150453567505, 'train/info_loss': 0.11583907902240753, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010111115407198668, 'train/video_loss': 0.11573796719312668, 'train/total_loss': 0.24205300211906433} -tensor(0.1371, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0564, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0166, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000275520165450871, 'train/lm_loss': 0.00012153269490227104, 'train/info_loss': 4.273470403859392e-05, 'train/ref_loss': 0.15076282620429993, 'train/uncertainty_loss': -6.980329053476454e-05, 'train/video_loss': 0.15293993055820465, 'train/total_loss': 0.15306146442890167} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2865, 'grad_norm': 5.592893600463867, 'learning_rate': 1.4201728521720748e-05}[Rank 2] Trainer log: {'loss': 0.2865, 'grad_norm': 5.592893600463867, 'learning_rate': 1.4201728521720748e-05}[Rank 0] Trainer log: {'loss': 0.2865, 'grad_norm': 5.592893600463867, 'learning_rate': 1.4201728521720748e-05} - -[Rank 1] Trainer log: {'loss': 0.2865, 'grad_norm': 5.592893600463867, 'learning_rate': 1.4201728521720748e-05} - -{'loss': 0.2865, 'grad_norm': 5.592893600463867, 'learning_rate': 1.4201728521720748e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003030982334166765, 'train/lm_loss': 9.403402218595148e-05, 'train/info_loss': 4.046991671202704e-05, 'train/ref_loss': 0.2120336890220642, 'train/uncertainty_loss': -7.692191284149886e-05, 'train/video_loss': 0.21442203223705292, 'train/total_loss': 0.2145160734653473} -tensor(0.0023, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0761, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016730692004784942, 'train/lm_loss': 0.00025989804416894915, 'train/info_loss': 4.8456229706062004e-05, 'train/ref_loss': 0.2680627107620239, 'train/uncertainty_loss': 0.00761384516954422, 'train/video_loss': 0.2770634591579437, 'train/total_loss': 0.2773233652114868} -tensor(0.0560, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3668, 'grad_norm': 5.899405002593994, 'learning_rate': 1.419204593018714e-05}[Rank 2] Trainer log: {'loss': 0.3668, 'grad_norm': 5.899405002593994, 'learning_rate': 1.419204593018714e-05}[Rank 3] Trainer log: {'loss': 0.3668, 'grad_norm': 5.899405002593994, 'learning_rate': 1.419204593018714e-05} - - -[Rank 0] Trainer log: {'loss': 0.3668, 'grad_norm': 5.899405002593994, 'learning_rate': 1.419204593018714e-05} -{'loss': 0.3668, 'grad_norm': 5.899405002593994, 'learning_rate': 1.419204593018714e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29141085147857665, 'train/info_loss': 0.16672734916210175, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012801806442439557, 'train/video_loss': 0.1665993332862854, 'train/total_loss': 0.458010196685791} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2120236396789551, 'train/info_loss': 0.13862183690071106, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000122123712208122, 'train/video_loss': 0.13849970698356628, 'train/total_loss': 0.35052335262298584} -tensor(0.1627, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3407, 'grad_norm': 2.6277263164520264, 'learning_rate': 1.4182358568249773e-05}[Rank 2] Trainer log: {'loss': 0.3407, 'grad_norm': 2.6277263164520264, 'learning_rate': 1.4182358568249773e-05}[Rank 3] Trainer log: {'loss': 0.3407, 'grad_norm': 2.6277263164520264, 'learning_rate': 1.4182358568249773e-05} - - -[Rank 0] Trainer log: {'loss': 0.3407, 'grad_norm': 2.6277263164520264, 'learning_rate': 1.4182358568249773e-05} -{'loss': 0.3407, 'grad_norm': 2.6277263164520264, 'learning_rate': 1.4182358568249773e-05, 'epoch': 0.39} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0931, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2151, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035703629255294804, 'train/lm_loss': 0.0001068783225491643, 'train/info_loss': 4.2436706280568615e-05, 'train/ref_loss': 0.3549324572086334, 'train/uncertainty_loss': 0.021506302058696747, 'train/video_loss': 0.37933748960494995, 'train/total_loss': 0.3794443607330322} -tensor(0.2394, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09338147044181824, 'train/info_loss': 0.22793924808502197, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010145414853468537, 'train/video_loss': 0.22783780097961426, 'train/total_loss': 0.321219265460968} -tensor(0.8841, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4483, 'grad_norm': 16.781604766845703, 'learning_rate': 1.4172666446932533e-05}[Rank 2] Trainer log: {'loss': 0.4483, 'grad_norm': 16.781604766845703, 'learning_rate': 1.4172666446932533e-05}[Rank 0] Trainer log: {'loss': 0.4483, 'grad_norm': 16.781604766845703, 'learning_rate': 1.4172666446932533e-05} - -[Rank 1] Trainer log: {'loss': 0.4483, 'grad_norm': 16.781604766845703, 'learning_rate': 1.4172666446932533e-05} - -{'loss': 0.4483, 'grad_norm': 16.781604766845703, 'learning_rate': 1.4172666446932533e-05, 'epoch': 0.39} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0494, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001903137075714767, 'train/lm_loss': 9.408168261870742e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.07246603071689606, 'train/uncertainty_loss': -7.479066262021661e-05, 'train/video_loss': 0.0739520788192749, 'train/total_loss': 0.07404615730047226} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2515239477157593, 'train/info_loss': 0.2303479015827179, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010137823410332203, 'train/video_loss': 0.23024652898311615, 'train/total_loss': 0.48177045583724976} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0136, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0768, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3613, 'grad_norm': 5.5696821212768555, 'learning_rate': 1.416296957726472e-05} -[Rank 3] Trainer log: {'loss': 0.3613, 'grad_norm': 5.5696821212768555, 'learning_rate': 1.416296957726472e-05} -[Rank 0] Trainer log: {'loss': 0.3613, 'grad_norm': 5.5696821212768555, 'learning_rate': 1.416296957726472e-05}[Rank 2] Trainer log: {'loss': 0.3613, 'grad_norm': 5.5696821212768555, 'learning_rate': 1.416296957726472e-05} - -{'loss': 0.3613, 'grad_norm': 5.5696821212768555, 'learning_rate': 1.416296957726472e-05, 'epoch': 0.4} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.1823, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030881096608936787, 'train/lm_loss': 7.830517133697868e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.11344755440950394, 'train/uncertainty_loss': -7.380598690360786e-05, 'train/video_loss': 0.1158791035413742, 'train/total_loss': 0.11595740914344788} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004229598212987185, 'train/lm_loss': 0.00010008690878748894, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.1434062272310257, 'train/uncertainty_loss': -7.826525252312422e-05, 'train/video_loss': 0.14675241708755493, 'train/total_loss': 0.146852508187294} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0029, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3052, 'grad_norm': 4.369136333465576, 'learning_rate': 1.4153267970281039e-05}[Rank 1] Trainer log: {'loss': 0.3052, 'grad_norm': 4.369136333465576, 'learning_rate': 1.4153267970281039e-05} -[Rank 2] Trainer log: {'loss': 0.3052, 'grad_norm': 4.369136333465576, 'learning_rate': 1.4153267970281039e-05} - -[Rank 3] Trainer log: {'loss': 0.3052, 'grad_norm': 4.369136333465576, 'learning_rate': 1.4153267970281039e-05} -{'loss': 0.3052, 'grad_norm': 4.369136333465576, 'learning_rate': 1.4153267970281039e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21891837120056154, 'train/info_loss': 0.15286009013652802, 'train/ref_loss': None, 'train/uncertainty_loss': -9.505097405053675e-05, 'train/video_loss': 0.15276503562927246, 'train/total_loss': 0.37168341875076294} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025037503801286223, 'train/lm_loss': 6.781858392059804e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.17185428738594055, 'train/uncertainty_loss': -7.397503941319883e-05, 'train/video_loss': 0.17381586134433746, 'train/total_loss': 0.17388367652893066} -[Rank 1] Trainer log: {'loss': 0.4435, 'grad_norm': 4.330347537994385, 'learning_rate': 1.4143561637021583e-05} -[Rank 2] Trainer log: {'loss': 0.4435, 'grad_norm': 4.330347537994385, 'learning_rate': 1.4143561637021583e-05} -[Rank 3] Trainer log: {'loss': 0.4435, 'grad_norm': 4.330347537994385, 'learning_rate': 1.4143561637021583e-05} -[Rank 0] Trainer log: {'loss': 0.4435, 'grad_norm': 4.330347537994385, 'learning_rate': 1.4143561637021583e-05} -{'loss': 0.4435, 'grad_norm': 4.330347537994385, 'learning_rate': 1.4143561637021583e-05, 'epoch': 0.4} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2360, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0305, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028462684713304043, 'train/lm_loss': 9.303312981501222e-05, 'train/info_loss': 3.743031629710458e-05, 'train/ref_loss': 0.24011147022247314, 'train/uncertainty_loss': 0.0030541507527232174, 'train/video_loss': 0.24548007547855377, 'train/total_loss': 0.24557310342788696} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18099300861358644, 'train/info_loss': 0.16825079917907715, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001198371290229261, 'train/video_loss': 0.16813096404075623, 'train/total_loss': 0.3491239547729492} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1771, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3523, 'grad_norm': 3.8545851707458496, 'learning_rate': 1.4133850588531826e-05}[Rank 0] Trainer log: {'loss': 0.3523, 'grad_norm': 3.8545851707458496, 'learning_rate': 1.4133850588531826e-05}[Rank 1] Trainer log: {'loss': 0.3523, 'grad_norm': 3.8545851707458496, 'learning_rate': 1.4133850588531826e-05} - - -[Rank 3] Trainer log: {'loss': 0.3523, 'grad_norm': 3.8545851707458496, 'learning_rate': 1.4133850588531826e-05} -{'loss': 0.3523, 'grad_norm': 3.8545851707458496, 'learning_rate': 1.4133850588531826e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3592735290527344, 'train/info_loss': 0.19412778317928314, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012470766669139265, 'train/video_loss': 0.19400307536125183, 'train/total_loss': 0.5532765984535217} -tensor(-0.0007, device='cuda:1', grad_fn=)tensor(0.3756, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002131549408659339, 'train/lm_loss': 7.618406671099365e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.1550242304801941, 'train/uncertainty_loss': -6.96134869940579e-05, 'train/video_loss': 0.15669555962085724, 'train/total_loss': 0.15677174925804138} -tensor(0.0051, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0584, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2481, 'grad_norm': 7.076961994171143, 'learning_rate': 1.4124134835862606e-05}[Rank 3] Trainer log: {'loss': 0.2481, 'grad_norm': 7.076961994171143, 'learning_rate': 1.4124134835862606e-05}[Rank 1] Trainer log: {'loss': 0.2481, 'grad_norm': 7.076961994171143, 'learning_rate': 1.4124134835862606e-05} - - -{'loss': 0.2481, 'grad_norm': 7.076961994171143, 'learning_rate': 1.4124134835862606e-05, 'epoch': 0.4} -[Rank 2] Trainer log: {'loss': 0.2481, 'grad_norm': 7.076961994171143, 'learning_rate': 1.4124134835862606e-05} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023022990208119155, 'train/lm_loss': 9.329526801593602e-05, 'train/info_loss': 3.9814316551201046e-05, 'train/ref_loss': 0.20901896059513092, 'train/uncertainty_loss': -7.033649017103016e-05, 'train/video_loss': 0.21083028614521027, 'train/total_loss': 0.21092358231544495} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2842, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0517, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23474385738372805, 'train/info_loss': 0.26499760150909424, 'train/ref_loss': None, 'train/uncertainty_loss': -8.858738583512605e-05, 'train/video_loss': 0.264909029006958, 'train/total_loss': 0.4996528923511505} -[Rank 1] Trainer log: {'loss': 0.3972, 'grad_norm': 10.118739128112793, 'learning_rate': 1.4114414390070112e-05}[Rank 0] Trainer log: {'loss': 0.3972, 'grad_norm': 10.118739128112793, 'learning_rate': 1.4114414390070112e-05} -[Rank 3] Trainer log: {'loss': 0.3972, 'grad_norm': 10.118739128112793, 'learning_rate': 1.4114414390070112e-05} -[Rank 2] Trainer log: {'loss': 0.3972, 'grad_norm': 10.118739128112793, 'learning_rate': 1.4114414390070112e-05} - -{'loss': 0.3972, 'grad_norm': 10.118739128112793, 'learning_rate': 1.4114414390070112e-05, 'epoch': 0.4} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021759625524282457, 'train/lm_loss': 7.697054534219206e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.14930224418640137, 'train/uncertainty_loss': -6.974030402489007e-05, 'train/video_loss': 0.15100814402103424, 'train/total_loss': 0.15108510851860046} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1102, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031339072156697515, 'train/lm_loss': 0.00018840496195480228, 'train/info_loss': 4.273470403859392e-05, 'train/ref_loss': 0.2897026538848877, 'train/uncertainty_loss': 0.011016263067722321, 'train/video_loss': 0.30326876044273376, 'train/total_loss': 0.3034571707248688} -tensor(0.2360, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.303, 'grad_norm': 2.3599960803985596, 'learning_rate': 1.410468926221588e-05} -[Rank 3] Trainer log: {'loss': 0.303, 'grad_norm': 2.3599960803985596, 'learning_rate': 1.410468926221588e-05}[Rank 0] Trainer log: {'loss': 0.303, 'grad_norm': 2.3599960803985596, 'learning_rate': 1.410468926221588e-05} - -[Rank 2] Trainer log: {'loss': 0.303, 'grad_norm': 2.3599960803985596, 'learning_rate': 1.410468926221588e-05} -{'loss': 0.303, 'grad_norm': 2.3599960803985596, 'learning_rate': 1.410468926221588e-05, 'epoch': 0.4} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16963142156600952, 'train/info_loss': 0.2201630026102066, 'train/ref_loss': None, 'train/uncertainty_loss': -9.453062666580081e-05, 'train/video_loss': 0.22006846964359283, 'train/total_loss': 0.38969987630844116} -tensor(0.1262, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0696, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023385544773191214, 'train/lm_loss': 0.00021248492412269117, 'train/info_loss': 4.619146420736797e-05, 'train/ref_loss': 0.2680330276489258, 'train/uncertainty_loss': 0.006956043839454651, 'train/video_loss': 0.2769061028957367, 'train/total_loss': 0.27711859345436096} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2716, 'grad_norm': 3.310142755508423, 'learning_rate': 1.4094959463366766e-05}[Rank 0] Trainer log: {'loss': 0.2716, 'grad_norm': 3.310142755508423, 'learning_rate': 1.4094959463366766e-05}[Rank 2] Trainer log: {'loss': 0.2716, 'grad_norm': 3.310142755508423, 'learning_rate': 1.4094959463366766e-05} - -[Rank 3] Trainer log: {'loss': 0.2716, 'grad_norm': 3.310142755508423, 'learning_rate': 1.4094959463366766e-05} - -{'loss': 0.2716, 'grad_norm': 3.310142755508423, 'learning_rate': 1.4094959463366766e-05, 'epoch': 0.4} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14771249294281005, 'train/info_loss': 0.16676360368728638, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010021975031122566, 'train/video_loss': 0.16666337847709656, 'train/total_loss': 0.3143758773803711} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2666561365127564, 'train/info_loss': 0.21198827028274536, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012718071229755878, 'train/video_loss': 0.21186108887195587, 'train/total_loss': 0.47851723432540894} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3598, 'grad_norm': 6.809991359710693, 'learning_rate': 1.408522500459495e-05} -[Rank 0] Trainer log: {'loss': 0.3598, 'grad_norm': 6.809991359710693, 'learning_rate': 1.408522500459495e-05}[Rank 3] Trainer log: {'loss': 0.3598, 'grad_norm': 6.809991359710693, 'learning_rate': 1.408522500459495e-05} -[Rank 2] Trainer log: {'loss': 0.3598, 'grad_norm': 6.809991359710693, 'learning_rate': 1.408522500459495e-05} - -{'loss': 0.3598, 'grad_norm': 6.809991359710693, 'learning_rate': 1.408522500459495e-05, 'epoch': 0.4} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1714, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027581045869737867, 'train/lm_loss': 9.26995009649545e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.32648715376853943, 'train/uncertainty_loss': 0.017138516902923586, 'train/video_loss': 0.34587135910987854, 'train/total_loss': 0.3459640443325043} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0933274269104004, 'train/info_loss': 0.16309133172035217, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011756843887269498, 'train/video_loss': 0.16297376155853271, 'train/total_loss': 0.2563011944293976} -tensor(0.6766, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3891, 'grad_norm': 10.660493850708008, 'learning_rate': 1.4075485896977904e-05} -[Rank 1] Trainer log: {'loss': 0.3891, 'grad_norm': 10.660493850708008, 'learning_rate': 1.4075485896977904e-05}[Rank 2] Trainer log: {'loss': 0.3891, 'grad_norm': 10.660493850708008, 'learning_rate': 1.4075485896977904e-05} - -[Rank 0] Trainer log: {'loss': 0.3891, 'grad_norm': 10.660493850708008, 'learning_rate': 1.4075485896977904e-05} -{'loss': 0.3891, 'grad_norm': 10.660493850708008, 'learning_rate': 1.4075485896977904e-05, 'epoch': 0.4} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2740654945373535, 'train/info_loss': 0.14262469112873077, 'train/ref_loss': None, 'train/uncertainty_loss': -8.761989302001894e-05, 'train/video_loss': 0.14253707230091095, 'train/total_loss': 0.41660255193710327} -tensor(0.4320, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0414, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10433260202407837, 'train/info_loss': 0.23348651826381683, 'train/ref_loss': None, 'train/uncertainty_loss': -8.727511740289629e-05, 'train/video_loss': 0.23339924216270447, 'train/total_loss': 0.33773183822631836} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3265, 'grad_norm': 5.834242343902588, 'learning_rate': 1.406574215159841e-05}[Rank 0] Trainer log: {'loss': 0.3265, 'grad_norm': 5.834242343902588, 'learning_rate': 1.406574215159841e-05}[Rank 2] Trainer log: {'loss': 0.3265, 'grad_norm': 5.834242343902588, 'learning_rate': 1.406574215159841e-05} - - -[Rank 1] Trainer log: {'loss': 0.3265, 'grad_norm': 5.834242343902588, 'learning_rate': 1.406574215159841e-05} -{'loss': 0.3265, 'grad_norm': 5.834242343902588, 'learning_rate': 1.406574215159841e-05, 'epoch': 0.4} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1558619260787964, 'train/info_loss': 0.208215594291687, 'train/ref_loss': None, 'train/uncertainty_loss': -9.66542516835034e-05, 'train/video_loss': 0.20811894536018372, 'train/total_loss': 0.36398088932037354} -tensor(0.0015, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017956264782696964, 'train/lm_loss': 9.791838238015771e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.22758503258228302, 'train/uncertainty_loss': -7.106587872840464e-05, 'train/video_loss': 0.22898760437965393, 'train/total_loss': 0.2290855199098587} -tensor(0.0047, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1461, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3599, 'grad_norm': 2.3876006603240967, 'learning_rate': 1.40559937795445e-05}[Rank 0] Trainer log: {'loss': 0.3599, 'grad_norm': 2.3876006603240967, 'learning_rate': 1.40559937795445e-05} -[Rank 3] Trainer log: {'loss': 0.3599, 'grad_norm': 2.3876006603240967, 'learning_rate': 1.40559937795445e-05} -[Rank 2] Trainer log: {'loss': 0.3599, 'grad_norm': 2.3876006603240967, 'learning_rate': 1.40559937795445e-05} - -{'loss': 0.3599, 'grad_norm': 2.3876006603240967, 'learning_rate': 1.40559937795445e-05, 'epoch': 0.4} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05316884517669678, 'train/info_loss': 0.1457439512014389, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011473592603579165, 'train/video_loss': 0.14562921226024628, 'train/total_loss': 0.1987980604171753} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002701493445783854, 'train/lm_loss': 5.2326032891869546e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.2074786126613617, 'train/uncertainty_loss': -7.157556246966124e-05, 'train/video_loss': 0.2095980942249298, 'train/total_loss': 0.2096504271030426} -tensor(0.3470, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3487, 'grad_norm': 3.973571538925171, 'learning_rate': 1.4046240791909492e-05} -[Rank 2] Trainer log: {'loss': 0.3487, 'grad_norm': 3.973571538925171, 'learning_rate': 1.4046240791909492e-05} -[Rank 3] Trainer log: {'loss': 0.3487, 'grad_norm': 3.973571538925171, 'learning_rate': 1.4046240791909492e-05} -[Rank 0] Trainer log: {'loss': 0.3487, 'grad_norm': 3.973571538925171, 'learning_rate': 1.4046240791909492e-05} -{'loss': 0.3487, 'grad_norm': 3.973571538925171, 'learning_rate': 1.4046240791909492e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24613988399505615, 'train/info_loss': 0.19643540680408478, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001015629735775292, 'train/video_loss': 0.1963338404893875, 'train/total_loss': 0.44247370958328247} -tensor(0.0710, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20651459693908691, 'train/info_loss': 0.13742198050022125, 'train/ref_loss': None, 'train/uncertainty_loss': -8.704436477273703e-05, 'train/video_loss': 0.137334942817688, 'train/total_loss': 0.3438495397567749} -tensor(0.2345, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0414, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3355, 'grad_norm': 2.5763027667999268, 'learning_rate': 1.4036483199791949e-05} -[Rank 2] Trainer log: {'loss': 0.3355, 'grad_norm': 2.5763027667999268, 'learning_rate': 1.4036483199791949e-05} -[Rank 0] Trainer log: {'loss': 0.3355, 'grad_norm': 2.5763027667999268, 'learning_rate': 1.4036483199791949e-05}[Rank 1] Trainer log: {'loss': 0.3355, 'grad_norm': 2.5763027667999268, 'learning_rate': 1.4036483199791949e-05} - -{'loss': 0.3355, 'grad_norm': 2.5763027667999268, 'learning_rate': 1.4036483199791949e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.423983907699585, 'train/info_loss': 0.13878300786018372, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011098359245806933, 'train/video_loss': 0.13867202401161194, 'train/total_loss': 0.5626559257507324} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1885, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00039228000678122045, 'train/lm_loss': 0.0001008494757115841, 'train/info_loss': 4.046991671202704e-05, 'train/ref_loss': 0.33470842242240906, 'train/uncertainty_loss': 0.018848368525505067, 'train/video_loss': 0.356735497713089, 'train/total_loss': 0.35683634877204895} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4067, 'grad_norm': 3.0548441410064697, 'learning_rate': 1.4026721014295675e-05}[Rank 1] Trainer log: {'loss': 0.4067, 'grad_norm': 3.0548441410064697, 'learning_rate': 1.4026721014295675e-05}[Rank 3] Trainer log: {'loss': 0.4067, 'grad_norm': 3.0548441410064697, 'learning_rate': 1.4026721014295675e-05} - - -[Rank 0] Trainer log: {'loss': 0.4067, 'grad_norm': 3.0548441410064697, 'learning_rate': 1.4026721014295675e-05} -{'loss': 0.4067, 'grad_norm': 3.0548441410064697, 'learning_rate': 1.4026721014295675e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15020663738250734, 'train/info_loss': 0.16932827234268188, 'train/ref_loss': None, 'train/uncertainty_loss': -8.284140494652092e-05, 'train/video_loss': 0.16924543678760529, 'train/total_loss': 0.31945207715034485} -tensor(0.0845, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4594, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5706025123596191, 'train/info_loss': 0.18533791601657867, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010757918935269118, 'train/video_loss': 0.1852303296327591, 'train/total_loss': 0.755832850933075} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1058, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4665, 'grad_norm': 2.789064645767212, 'learning_rate': 1.4016954246529697e-05}[Rank 3] Trainer log: {'loss': 0.4665, 'grad_norm': 2.789064645767212, 'learning_rate': 1.4016954246529697e-05}[Rank 1] Trainer log: {'loss': 0.4665, 'grad_norm': 2.789064645767212, 'learning_rate': 1.4016954246529697e-05} - - -[Rank 0] Trainer log: {'loss': 0.4665, 'grad_norm': 2.789064645767212, 'learning_rate': 1.4016954246529697e-05} -{'loss': 0.4665, 'grad_norm': 2.789064645767212, 'learning_rate': 1.4016954246529697e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000200556218624115, 'train/lm_loss': 0.0003947175107896328, 'train/info_loss': 5.7991957874037325e-05, 'train/ref_loss': 0.2005596160888672, 'train/uncertainty_loss': -7.0324691478163e-05, 'train/video_loss': 0.20215174555778503, 'train/total_loss': 0.20254646241664886} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025296062231063845, 'train/lm_loss': 8.70515184942633e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.10010382533073425, 'train/uncertainty_loss': -7.16259761247784e-05, 'train/video_loss': 0.10209421068429947, 'train/total_loss': 0.10218126326799393} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2978, 'grad_norm': 4.398181438446045, 'learning_rate': 1.4007182907608264e-05}[Rank 2] Trainer log: {'loss': 0.2978, 'grad_norm': 4.398181438446045, 'learning_rate': 1.4007182907608264e-05}[Rank 3] Trainer log: {'loss': 0.2978, 'grad_norm': 4.398181438446045, 'learning_rate': 1.4007182907608264e-05} - - -[Rank 0] Trainer log: {'loss': 0.2978, 'grad_norm': 4.398181438446045, 'learning_rate': 1.4007182907608264e-05} -{'loss': 0.2978, 'grad_norm': 4.398181438446045, 'learning_rate': 1.4007182907608264e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0918, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019265315495431423, 'train/lm_loss': 5.9095234610140325e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.13302089273929596, 'train/uncertainty_loss': -6.821284187026322e-05, 'train/video_loss': 0.13452906906604767, 'train/total_loss': 0.13458816707134247} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33605675697326665, 'train/info_loss': 0.28187063336372375, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013680709525942802, 'train/video_loss': 0.28173384070396423, 'train/total_loss': 0.6177905797958374} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3428, 'grad_norm': 3.912733316421509, 'learning_rate': 1.399740700865082e-05}[Rank 3] Trainer log: {'loss': 0.3428, 'grad_norm': 3.912733316421509, 'learning_rate': 1.399740700865082e-05} - -[Rank 2] Trainer log: {'loss': 0.3428, 'grad_norm': 3.912733316421509, 'learning_rate': 1.399740700865082e-05} -[Rank 0] Trainer log: {'loss': 0.3428, 'grad_norm': 3.912733316421509, 'learning_rate': 1.399740700865082e-05} -{'loss': 0.3428, 'grad_norm': 3.912733316421509, 'learning_rate': 1.399740700865082e-05, 'epoch': 0.4} -tensor(-0.0017, device='cuda:3', grad_fn=) tensor(-0.0017, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10887235403060913, 'train/info_loss': 0.12969979643821716, 'train/ref_loss': None, 'train/uncertainty_loss': -9.183140355162323e-05, 'train/video_loss': 0.12960796058177948, 'train/total_loss': 0.2384803146123886} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0576, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032940162345767026, 'train/lm_loss': 8.702768827788532e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.17277750372886658, 'train/uncertainty_loss': -7.169058080762625e-05, 'train/video_loss': 0.17537815868854523, 'train/total_loss': 0.1754651814699173} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2566, 'grad_norm': 1.4442999362945557, 'learning_rate': 1.3987626560782006e-05}[Rank 3] Trainer log: {'loss': 0.2566, 'grad_norm': 1.4442999362945557, 'learning_rate': 1.3987626560782006e-05}[Rank 2] Trainer log: {'loss': 0.2566, 'grad_norm': 1.4442999362945557, 'learning_rate': 1.3987626560782006e-05} - - -[Rank 0] Trainer log: {'loss': 0.2566, 'grad_norm': 1.4442999362945557, 'learning_rate': 1.3987626560782006e-05} -{'loss': 0.2566, 'grad_norm': 1.4442999362945557, 'learning_rate': 1.3987626560782006e-05, 'epoch': 0.4} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.6170, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018000544514507055, 'train/lm_loss': 6.777091766707599e-05, 'train/info_loss': 3.43310966854915e-05, 'train/ref_loss': 0.6325143575668335, 'train/uncertainty_loss': 0.0616972029209137, 'train/video_loss': 0.6956859230995178, 'train/total_loss': 0.6957536935806274} -tensor(0.0646, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0320, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019035208970308304, 'train/lm_loss': 7.873415597714484e-05, 'train/info_loss': 4.0767914470052347e-05, 'train/ref_loss': 0.23426425457000732, 'train/uncertainty_loss': 0.003200538083910942, 'train/video_loss': 0.23902836441993713, 'train/total_loss': 0.23910710215568542} -[Rank 0] Trainer log: {'loss': 0.4392, 'grad_norm': 2.5543346405029297, 'learning_rate': 1.3977841575131628e-05}[Rank 3] Trainer log: {'loss': 0.4392, 'grad_norm': 2.5543346405029297, 'learning_rate': 1.3977841575131628e-05} - -[Rank 1] Trainer log: {'loss': 0.4392, 'grad_norm': 2.5543346405029297, 'learning_rate': 1.3977841575131628e-05}[Rank 2] Trainer log: {'loss': 0.4392, 'grad_norm': 2.5543346405029297, 'learning_rate': 1.3977841575131628e-05} - -{'loss': 0.4392, 'grad_norm': 2.5543346405029297, 'learning_rate': 1.3977841575131628e-05, 'epoch': 0.4} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21961896419525148, 'train/info_loss': 0.18146686255931854, 'train/ref_loss': None, 'train/uncertainty_loss': -9.155258885584772e-05, 'train/video_loss': 0.18137530982494354, 'train/total_loss': 0.40099427103996277} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029038696084171534, 'train/lm_loss': 8.829074795357883e-05, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.11820612847805023, 'train/uncertainty_loss': -6.771337357349695e-05, 'train/video_loss': 0.12050073593854904, 'train/total_loss': 0.12058902531862259} -[Rank 0] Trainer log: {'loss': 0.3828, 'grad_norm': 5.858794689178467, 'learning_rate': 1.3968052062834667e-05}[Rank 1] Trainer log: {'loss': 0.3828, 'grad_norm': 5.858794689178467, 'learning_rate': 1.3968052062834667e-05} -[Rank 3] Trainer log: {'loss': 0.3828, 'grad_norm': 5.858794689178467, 'learning_rate': 1.3968052062834667e-05} - -[Rank 2] Trainer log: {'loss': 0.3828, 'grad_norm': 5.858794689178467, 'learning_rate': 1.3968052062834667e-05} -{'loss': 0.3828, 'grad_norm': 5.858794689178467, 'learning_rate': 1.3968052062834667e-05, 'epoch': 0.4} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.3746, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0163, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000222201575525105, 'train/lm_loss': 0.00022439283784478903, 'train/info_loss': 4.8456229706062004e-05, 'train/ref_loss': 0.22930024564266205, 'train/uncertainty_loss': 0.0016313033178448678, 'train/video_loss': 0.23275762796401978, 'train/total_loss': 0.23298202455043793} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20106196403503418, 'train/info_loss': 0.1583949774503708, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010414093267172576, 'train/video_loss': 0.158290833234787, 'train/total_loss': 0.35935279726982117} -tensor(0.1921, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3837, 'grad_norm': 4.897834300994873, 'learning_rate': 1.395825803503125e-05}[Rank 1] Trainer log: {'loss': 0.3837, 'grad_norm': 4.897834300994873, 'learning_rate': 1.395825803503125e-05}[Rank 3] Trainer log: {'loss': 0.3837, 'grad_norm': 4.897834300994873, 'learning_rate': 1.395825803503125e-05} - - -[Rank 2] Trainer log: {'loss': 0.3837, 'grad_norm': 4.897834300994873, 'learning_rate': 1.395825803503125e-05} -{'loss': 0.3837, 'grad_norm': 4.897834300994873, 'learning_rate': 1.395825803503125e-05, 'epoch': 0.4} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3658956050872803, 'train/info_loss': 0.21741308271884918, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001272806664928794, 'train/video_loss': 0.21728579699993134, 'train/total_loss': 0.5831813812255859} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0097, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33976933956146244, 'train/info_loss': 0.1566154807806015, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013241436099633574, 'train/video_loss': 0.15648306906223297, 'train/total_loss': 0.4962524175643921} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1494, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3939, 'grad_norm': 2.9543075561523438, 'learning_rate': 1.394845950286664e-05}[Rank 1] Trainer log: {'loss': 0.3939, 'grad_norm': 2.9543075561523438, 'learning_rate': 1.394845950286664e-05}[Rank 3] Trainer log: {'loss': 0.3939, 'grad_norm': 2.9543075561523438, 'learning_rate': 1.394845950286664e-05} - - -[Rank 2] Trainer log: {'loss': 0.3939, 'grad_norm': 2.9543075561523438, 'learning_rate': 1.394845950286664e-05} -{'loss': 0.3939, 'grad_norm': 2.9543075561523438, 'learning_rate': 1.394845950286664e-05, 'epoch': 0.4} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3343, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022634181659668686, 'train/lm_loss': 7.651772466488183e-05, 'train/info_loss': 3.653631210909225e-05, 'train/ref_loss': 0.44741615653038025, 'train/uncertainty_loss': 0.03343303203582764, 'train/video_loss': 0.48269644379615784, 'train/total_loss': 0.48277297616004944} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2678464651107788, 'train/info_loss': 0.1923554241657257, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014027177821844818, 'train/video_loss': 0.19221515953540802, 'train/total_loss': 0.46006160974502563} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.6460, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4529, 'grad_norm': 11.21522331237793, 'learning_rate': 1.3938656477491233e-05}[Rank 3] Trainer log: {'loss': 0.4529, 'grad_norm': 11.21522331237793, 'learning_rate': 1.3938656477491233e-05} - -[Rank 2] Trainer log: {'loss': 0.4529, 'grad_norm': 11.21522331237793, 'learning_rate': 1.3938656477491233e-05} -[Rank 0] Trainer log: {'loss': 0.4529, 'grad_norm': 11.21522331237793, 'learning_rate': 1.3938656477491233e-05} -{'loss': 0.4529, 'grad_norm': 11.21522331237793, 'learning_rate': 1.3938656477491233e-05, 'epoch': 0.4} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10062665939331056, 'train/info_loss': 0.13749995827674866, 'train/ref_loss': None, 'train/uncertainty_loss': -9.869781788438559e-05, 'train/video_loss': 0.1374012678861618, 'train/total_loss': 0.2380279302597046} -tensor(0.0096, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23248813152313233, 'train/info_loss': 0.17499282956123352, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011464565759524704, 'train/video_loss': 0.17487818002700806, 'train/total_loss': 0.4073663353919983} -tensor(0.1037, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3758, 'grad_norm': 10.39178466796875, 'learning_rate': 1.392884897006053e-05} -[Rank 0] Trainer log: {'loss': 0.3758, 'grad_norm': 10.39178466796875, 'learning_rate': 1.392884897006053e-05}[Rank 1] Trainer log: {'loss': 0.3758, 'grad_norm': 10.39178466796875, 'learning_rate': 1.392884897006053e-05} -[Rank 2] Trainer log: {'loss': 0.3758, 'grad_norm': 10.39178466796875, 'learning_rate': 1.392884897006053e-05} - -{'loss': 0.3758, 'grad_norm': 10.39178466796875, 'learning_rate': 1.392884897006053e-05, 'epoch': 0.4} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(1.2701, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) {'train/tv_loss': 0.0004888799507170915, 'train/lm_loss': 5.876154755242169e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 1.0933351516723633, 'train/uncertainty_loss': 0.12700778245925903, 'train/video_loss': 1.2242851257324219, 'train/total_loss': 1.2243438959121704} -tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3413, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2230, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001746831461787224, 'train/lm_loss': 7.675605011172593e-05, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.19614174962043762, 'train/uncertainty_loss': -6.957107689231634e-05, 'train/video_loss': 0.1975073665380478, 'train/total_loss': 0.1975841224193573} -[Rank 1] Trainer log: {'loss': 0.4438, 'grad_norm': 26.59707260131836, 'learning_rate': 1.391903699173514e-05}[Rank 3] Trainer log: {'loss': 0.4438, 'grad_norm': 26.59707260131836, 'learning_rate': 1.391903699173514e-05}[Rank 0] Trainer log: {'loss': 0.4438, 'grad_norm': 26.59707260131836, 'learning_rate': 1.391903699173514e-05} - -[Rank 2] Trainer log: {'loss': 0.4438, 'grad_norm': 26.59707260131836, 'learning_rate': 1.391903699173514e-05} - -{'loss': 0.4438, 'grad_norm': 26.59707260131836, 'learning_rate': 1.391903699173514e-05, 'epoch': 0.4} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.3315, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.314320707321167, 'train/info_loss': 0.2228691428899765, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001118929241783917, 'train/video_loss': 0.2227572500705719, 'train/total_loss': 0.5370779633522034} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3525458812713623, 'train/info_loss': 0.23121245205402374, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012738971272483469, 'train/video_loss': 0.23108506202697754, 'train/total_loss': 0.5836309194564819} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.353, 'grad_norm': 6.995198726654053, 'learning_rate': 1.3909220553680752e-05}[Rank 0] Trainer log: {'loss': 0.353, 'grad_norm': 6.995198726654053, 'learning_rate': 1.3909220553680752e-05}[Rank 3] Trainer log: {'loss': 0.353, 'grad_norm': 6.995198726654053, 'learning_rate': 1.3909220553680752e-05} - - -[Rank 2] Trainer log: {'loss': 0.353, 'grad_norm': 6.995198726654053, 'learning_rate': 1.3909220553680752e-05} -{'loss': 0.353, 'grad_norm': 6.995198726654053, 'learning_rate': 1.3909220553680752e-05, 'epoch': 0.4} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0082, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025468599051237106, 'train/lm_loss': 7.663688738830389e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.21671336889266968, 'train/uncertainty_loss': 0.0008171082474291325, 'train/video_loss': 0.21960397064685822, 'train/total_loss': 0.21968060731887817} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3766593933105469, 'train/info_loss': 0.26090022921562195, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012990684481337667, 'train/video_loss': 0.260770320892334, 'train/total_loss': 0.6374297142028809} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4356, 'grad_norm': 4.68511438369751, 'learning_rate': 1.3899399667068138e-05}[Rank 2] Trainer log: {'loss': 0.4356, 'grad_norm': 4.68511438369751, 'learning_rate': 1.3899399667068138e-05} - -[Rank 3] Trainer log: {'loss': 0.4356, 'grad_norm': 4.68511438369751, 'learning_rate': 1.3899399667068138e-05} -[Rank 0] Trainer log: {'loss': 0.4356, 'grad_norm': 4.68511438369751, 'learning_rate': 1.3899399667068138e-05} -{'loss': 0.4356, 'grad_norm': 4.68511438369751, 'learning_rate': 1.3899399667068138e-05, 'epoch': 0.4} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.2631, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00046638739295303824, 'train/lm_loss': 0.00010523410746827723, 'train/info_loss': 4.1125513234874234e-05, 'train/ref_loss': 0.3852769434452057, 'train/uncertainty_loss': 0.026311886310577393, 'train/video_loss': 0.41536104679107666, 'train/total_loss': 0.4154662787914276} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4627386569976807, 'train/info_loss': 0.25661805272102356, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011224375339224935, 'train/video_loss': 0.2565058171749115, 'train/total_loss': 0.7192444801330566} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.2592, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4474, 'grad_norm': 6.247527122497559, 'learning_rate': 1.3889574343073125e-05}[Rank 2] Trainer log: {'loss': 0.4474, 'grad_norm': 6.247527122497559, 'learning_rate': 1.3889574343073125e-05} - -[Rank 3] Trainer log: {'loss': 0.4474, 'grad_norm': 6.247527122497559, 'learning_rate': 1.3889574343073125e-05}[Rank 1] Trainer log: {'loss': 0.4474, 'grad_norm': 6.247527122497559, 'learning_rate': 1.3889574343073125e-05} - -{'loss': 0.4474, 'grad_norm': 6.247527122497559, 'learning_rate': 1.3889574343073125e-05, 'epoch': 0.4} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15378267765045167, 'train/info_loss': 0.10639529675245285, 'train/ref_loss': None, 'train/uncertainty_loss': -8.806225960142911e-05, 'train/video_loss': 0.10630723088979721, 'train/total_loss': 0.2600899040699005} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1666, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0220, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.47559485435485843, 'train/info_loss': 0.19835808873176575, 'train/ref_loss': None, 'train/uncertainty_loss': -8.787352708168329e-05, 'train/video_loss': 0.19827021658420563, 'train/total_loss': 0.6738650798797607} -tensor(0.0086, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0020, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3305, 'grad_norm': 5.624344348907471, 'learning_rate': 1.3879744592876599e-05}[Rank 1] Trainer log: {'loss': 0.3305, 'grad_norm': 5.624344348907471, 'learning_rate': 1.3879744592876599e-05}[Rank 3] Trainer log: {'loss': 0.3305, 'grad_norm': 5.624344348907471, 'learning_rate': 1.3879744592876599e-05} - - -[Rank 2] Trainer log: {'loss': 0.3305, 'grad_norm': 5.624344348907471, 'learning_rate': 1.3879744592876599e-05} -{'loss': 0.3305, 'grad_norm': 5.624344348907471, 'learning_rate': 1.3879744592876599e-05, 'epoch': 0.4} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005936915520578623, 'train/lm_loss': 0.000120913190767169, 'train/info_loss': 4.4820684706792235e-05, 'train/ref_loss': 0.12408995628356934, 'train/uncertainty_loss': -7.183293346315623e-05, 'train/video_loss': 0.12881247699260712, 'train/total_loss': 0.128933385014534} -tensor(0.1461, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3103263139724732, 'train/info_loss': 0.2286849468946457, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012185336090624333, 'train/video_loss': 0.2285631000995636, 'train/total_loss': 0.5388894081115723} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3804, 'grad_norm': 7.533289432525635, 'learning_rate': 1.3869910427664467e-05}[Rank 0] Trainer log: {'loss': 0.3804, 'grad_norm': 7.533289432525635, 'learning_rate': 1.3869910427664467e-05} -[Rank 1] Trainer log: {'loss': 0.3804, 'grad_norm': 7.533289432525635, 'learning_rate': 1.3869910427664467e-05} -[Rank 2] Trainer log: {'loss': 0.3804, 'grad_norm': 7.533289432525635, 'learning_rate': 1.3869910427664467e-05} - -{'loss': 0.3804, 'grad_norm': 7.533289432525635, 'learning_rate': 1.3869910427664467e-05, 'epoch': 0.4} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.5052, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22424082756042482, 'train/info_loss': 0.2851575016975403, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011614019749686123, 'train/video_loss': 0.28504136204719543, 'train/total_loss': 0.5092821717262268} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.3096, device='cuda:0', grad_fn=)tensor(0.3314, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) - tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002506816759705544, 'train/lm_loss': 5.950042395852506e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.43049418926239014, 'train/uncertainty_loss': 0.030959203839302063, 'train/video_loss': 0.4634934663772583, 'train/total_loss': 0.46355298161506653} -tensor(0.0416, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4699, 'grad_norm': 22.693336486816406, 'learning_rate': 1.3860071858627677e-05}[Rank 3] Trainer log: {'loss': 0.4699, 'grad_norm': 22.693336486816406, 'learning_rate': 1.3860071858627677e-05} - -[Rank 2] Trainer log: {'loss': 0.4699, 'grad_norm': 22.693336486816406, 'learning_rate': 1.3860071858627677e-05}[Rank 0] Trainer log: {'loss': 0.4699, 'grad_norm': 22.693336486816406, 'learning_rate': 1.3860071858627677e-05} - -{'loss': 0.4699, 'grad_norm': 22.693336486816406, 'learning_rate': 1.3860071858627677e-05, 'epoch': 0.41} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2847421884536743, 'train/info_loss': 0.17596971988677979, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015092166140675547, 'train/video_loss': 0.1758188009262085, 'train/total_loss': 0.46056100726127625} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0422, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0636, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003970605786889792, 'train/lm_loss': 8.698002202436329e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.2407735288143158, 'train/uncertainty_loss': 0.006355507671833039, 'train/video_loss': 0.25034233927726746, 'train/total_loss': 0.25042933225631714} -[Rank 3] Trainer log: {'loss': 0.3791, 'grad_norm': 5.5703229904174805, 'learning_rate': 1.3850228896962178e-05}[Rank 2] Trainer log: {'loss': 0.3791, 'grad_norm': 5.5703229904174805, 'learning_rate': 1.3850228896962178e-05}[Rank 1] Trainer log: {'loss': 0.3791, 'grad_norm': 5.5703229904174805, 'learning_rate': 1.3850228896962178e-05} - - -[Rank 0] Trainer log: {'loss': 0.3791, 'grad_norm': 5.5703229904174805, 'learning_rate': 1.3850228896962178e-05} -{'loss': 0.3791, 'grad_norm': 5.5703229904174805, 'learning_rate': 1.3850228896962178e-05, 'epoch': 0.41} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35749142169952397, 'train/info_loss': 0.23427391052246094, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011133761145174504, 'train/video_loss': 0.2341625690460205, 'train/total_loss': 0.5916540026664734} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0045, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42003793716430665, 'train/info_loss': 0.14168855547904968, 'train/ref_loss': None, 'train/uncertainty_loss': -9.577451273798943e-05, 'train/video_loss': 0.14159278571605682, 'train/total_loss': 0.5616307258605957} -[Rank 1] Trainer log: {'loss': 0.4887, 'grad_norm': 4.6952900886535645, 'learning_rate': 1.3840381553868929e-05}[Rank 3] Trainer log: {'loss': 0.4887, 'grad_norm': 4.6952900886535645, 'learning_rate': 1.3840381553868929e-05}[Rank 2] Trainer log: {'loss': 0.4887, 'grad_norm': 4.6952900886535645, 'learning_rate': 1.3840381553868929e-05} - - -[Rank 0] Trainer log: {'loss': 0.4887, 'grad_norm': 4.6952900886535645, 'learning_rate': 1.3840381553868929e-05} -{'loss': 0.4887, 'grad_norm': 4.6952900886535645, 'learning_rate': 1.3840381553868929e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0221, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022854458075016737, 'train/lm_loss': 0.00012046046322211623, 'train/info_loss': 4.58338727185037e-05, 'train/ref_loss': 0.20116645097732544, 'train/uncertainty_loss': 0.00220564678311348, 'train/video_loss': 0.20524628460407257, 'train/total_loss': 0.20536674559116364} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002973356051370502, 'train/lm_loss': 6.777091766707599e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.16776660084724426, 'train/uncertainty_loss': -6.728996522724629e-05, 'train/video_loss': 0.1701112538576126, 'train/total_loss': 0.17017902433872223} -[Rank 2] Trainer log: {'loss': 0.3503, 'grad_norm': 3.279536008834839, 'learning_rate': 1.3830529840553857e-05}[Rank 0] Trainer log: {'loss': 0.3503, 'grad_norm': 3.279536008834839, 'learning_rate': 1.3830529840553857e-05}[Rank 1] Trainer log: {'loss': 0.3503, 'grad_norm': 3.279536008834839, 'learning_rate': 1.3830529840553857e-05} - -[Rank 3] Trainer log: {'loss': 0.3503, 'grad_norm': 3.279536008834839, 'learning_rate': 1.3830529840553857e-05} - -{'loss': 0.3503, 'grad_norm': 3.279536008834839, 'learning_rate': 1.3830529840553857e-05, 'epoch': 0.41} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.4001, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0678, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004214144311845303, 'train/lm_loss': 5.9190572937950495e-05, 'train/info_loss': 3.278148142271675e-05, 'train/ref_loss': 0.2540576756000519, 'train/uncertainty_loss': 0.006781004369258881, 'train/video_loss': 0.2642427980899811, 'train/total_loss': 0.26430198550224304} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3458537578582764, 'train/info_loss': 0.17126213014125824, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012651528231799603, 'train/video_loss': 0.17113561928272247, 'train/total_loss': 0.5169894099235535} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4309, 'grad_norm': 4.749833583831787, 'learning_rate': 1.382067376822788e-05}[Rank 0] Trainer log: {'loss': 0.4309, 'grad_norm': 4.749833583831787, 'learning_rate': 1.382067376822788e-05} -[Rank 3] Trainer log: {'loss': 0.4309, 'grad_norm': 4.749833583831787, 'learning_rate': 1.382067376822788e-05} -[Rank 1] Trainer log: {'loss': 0.4309, 'grad_norm': 4.749833583831787, 'learning_rate': 1.382067376822788e-05} - -{'loss': 0.4309, 'grad_norm': 4.749833583831787, 'learning_rate': 1.382067376822788e-05, 'epoch': 0.41} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2295245409011841, 'train/info_loss': 0.25366345047950745, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010466251987963916, 'train/video_loss': 0.25355878472328186, 'train/total_loss': 0.4830833077430725} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0643, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3798624753952027, 'train/info_loss': 0.20177240669727325, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013363470789045097, 'train/video_loss': 0.20163877308368683, 'train/total_loss': 0.5815012454986572} -tensor(0.4138, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5613, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4583, 'grad_norm': 22.635026931762695, 'learning_rate': 1.3810813348106859e-05}[Rank 2] Trainer log: {'loss': 0.4583, 'grad_norm': 22.635026931762695, 'learning_rate': 1.3810813348106859e-05}[Rank 1] Trainer log: {'loss': 0.4583, 'grad_norm': 22.635026931762695, 'learning_rate': 1.3810813348106859e-05} - - -[Rank 0] Trainer log: {'loss': 0.4583, 'grad_norm': 22.635026931762695, 'learning_rate': 1.3810813348106859e-05} -{'loss': 0.4583, 'grad_norm': 22.635026931762695, 'learning_rate': 1.3810813348106859e-05, 'epoch': 0.41} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2657996416091919, 'train/info_loss': 0.1649838238954544, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010426046792417766, 'train/video_loss': 0.16487956047058105, 'train/total_loss': 0.43067920207977295} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2178767204284668, 'train/info_loss': 0.20901770889759064, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011828683782368899, 'train/video_loss': 0.20889942348003387, 'train/total_loss': 0.42677614092826843} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4339, 'grad_norm': 3.3792202472686768, 'learning_rate': 1.3800948591411622e-05}[Rank 3] Trainer log: {'loss': 0.4339, 'grad_norm': 3.3792202472686768, 'learning_rate': 1.3800948591411622e-05} - -[Rank 2] Trainer log: {'loss': 0.4339, 'grad_norm': 3.3792202472686768, 'learning_rate': 1.3800948591411622e-05} -[Rank 0] Trainer log: {'loss': 0.4339, 'grad_norm': 3.3792202472686768, 'learning_rate': 1.3800948591411622e-05} -{'loss': 0.4339, 'grad_norm': 3.3792202472686768, 'learning_rate': 1.3800948591411622e-05, 'epoch': 0.41} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1898, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005367392674088478, 'train/lm_loss': 9.32237773668021e-05, 'train/info_loss': 3.951631879317574e-05, 'train/ref_loss': 0.33897578716278076, 'train/uncertainty_loss': 0.018978793919086457, 'train/video_loss': 0.3622880280017853, 'train/total_loss': 0.362381249666214} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.203043270111084, 'train/info_loss': 0.18465998768806458, 'train/ref_loss': None, 'train/uncertainty_loss': -9.27461893297732e-05, 'train/video_loss': 0.18456724286079407, 'train/total_loss': 0.3876104950904846} -tensor(0.0486, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2331, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1747, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3662, 'grad_norm': 3.1098575592041016, 'learning_rate': 1.3791079509367915e-05}[Rank 2] Trainer log: {'loss': 0.3662, 'grad_norm': 3.1098575592041016, 'learning_rate': 1.3791079509367915e-05}[Rank 0] Trainer log: {'loss': 0.3662, 'grad_norm': 3.1098575592041016, 'learning_rate': 1.3791079509367915e-05} - - -{'loss': 0.3662, 'grad_norm': 3.1098575592041016, 'learning_rate': 1.3791079509367915e-05, 'epoch': 0.41} -[Rank 3] Trainer log: {'loss': 0.3662, 'grad_norm': 3.1098575592041016, 'learning_rate': 1.3791079509367915e-05} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1870, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029390526469796896, 'train/lm_loss': 5.8618537150323394e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.3384823799133301, 'train/uncertainty_loss': 0.018700072169303895, 'train/video_loss': 0.35956498980522156, 'train/total_loss': 0.35962361097335815} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07318536043167115, 'train/info_loss': 0.16652260720729828, 'train/ref_loss': None, 'train/uncertainty_loss': -8.293928112834692e-05, 'train/video_loss': 0.16643966734409332, 'train/total_loss': 0.23962503671646118} -[Rank 0] Trainer log: {'loss': 0.4068, 'grad_norm': 8.545979499816895, 'learning_rate': 1.3781206113206418e-05}[Rank 1] Trainer log: {'loss': 0.4068, 'grad_norm': 8.545979499816895, 'learning_rate': 1.3781206113206418e-05} -[Rank 2] Trainer log: {'loss': 0.4068, 'grad_norm': 8.545979499816895, 'learning_rate': 1.3781206113206418e-05} - -[Rank 3] Trainer log: {'loss': 0.4068, 'grad_norm': 8.545979499816895, 'learning_rate': 1.3781206113206418e-05} -{'loss': 0.4068, 'grad_norm': 8.545979499816895, 'learning_rate': 1.3781206113206418e-05, 'epoch': 0.41} -tensor(0.0374, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4617, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2287, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4604369640350342, 'train/info_loss': 0.15732067823410034, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000100024847779423, 'train/video_loss': 0.15722064673900604, 'train/total_loss': 0.6176576018333435} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0503, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003507171757519245, 'train/lm_loss': 8.650339441373944e-05, 'train/info_loss': 3.5403903893893585e-05, 'train/ref_loss': 0.20535556972026825, 'train/uncertainty_loss': -7.037746836431324e-05, 'train/video_loss': 0.20812633633613586, 'train/total_loss': 0.20821283757686615} -[Rank 0] Trainer log: {'loss': 0.4503, 'grad_norm': 8.875125885009766, 'learning_rate': 1.3771328414162714e-05}[Rank 3] Trainer log: {'loss': 0.4503, 'grad_norm': 8.875125885009766, 'learning_rate': 1.3771328414162714e-05} -[Rank 1] Trainer log: {'loss': 0.4503, 'grad_norm': 8.875125885009766, 'learning_rate': 1.3771328414162714e-05} -[Rank 2] Trainer log: {'loss': 0.4503, 'grad_norm': 8.875125885009766, 'learning_rate': 1.3771328414162714e-05} - -{'loss': 0.4503, 'grad_norm': 8.875125885009766, 'learning_rate': 1.3771328414162714e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30193138122558594, 'train/info_loss': 0.225450798869133, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011655729031190276, 'train/video_loss': 0.22533424198627472, 'train/total_loss': 0.5272656083106995} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21705965995788576, 'train/info_loss': 0.1936219334602356, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011821911903098226, 'train/video_loss': 0.1935037076473236, 'train/total_loss': 0.4105633497238159} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.5336, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0213, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.406, 'grad_norm': 7.272488594055176, 'learning_rate': 1.3761446423477282e-05}[Rank 1] Trainer log: {'loss': 0.406, 'grad_norm': 7.272488594055176, 'learning_rate': 1.3761446423477282e-05} - -[Rank 0] Trainer log: {'loss': 0.406, 'grad_norm': 7.272488594055176, 'learning_rate': 1.3761446423477282e-05}[Rank 2] Trainer log: {'loss': 0.406, 'grad_norm': 7.272488594055176, 'learning_rate': 1.3761446423477282e-05} - -{'loss': 0.406, 'grad_norm': 7.272488594055176, 'learning_rate': 1.3761446423477282e-05, 'epoch': 0.41} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19940872192382814, 'train/info_loss': 0.2630764842033386, 'train/ref_loss': None, 'train/uncertainty_loss': -9.307272848673165e-05, 'train/video_loss': 0.26298341155052185, 'train/total_loss': 0.4623921513557434} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3411092519760132, 'train/info_loss': 0.18675246834754944, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013900039484724403, 'train/video_loss': 0.18661347031593323, 'train/total_loss': 0.5277227163314819} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1958, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.338, 'grad_norm': 5.568989276885986, 'learning_rate': 1.3751560152395489e-05}[Rank 3] Trainer log: {'loss': 0.338, 'grad_norm': 5.568989276885986, 'learning_rate': 1.3751560152395489e-05}[Rank 2] Trainer log: {'loss': 0.338, 'grad_norm': 5.568989276885986, 'learning_rate': 1.3751560152395489e-05} - - -[Rank 0] Trainer log: {'loss': 0.338, 'grad_norm': 5.568989276885986, 'learning_rate': 1.3751560152395489e-05} -{'loss': 0.338, 'grad_norm': 5.568989276885986, 'learning_rate': 1.3751560152395489e-05, 'epoch': 0.41} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2572486877441406, 'train/info_loss': 0.2812058627605438, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012806217418983578, 'train/video_loss': 0.2810778021812439, 'train/total_loss': 0.5383265018463135} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23515539169311525, 'train/info_loss': 0.21411937475204468, 'train/ref_loss': None, 'train/uncertainty_loss': -9.942869655787945e-05, 'train/video_loss': 0.21401993930339813, 'train/total_loss': 0.44917532801628113} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3694, 'grad_norm': 3.491997003555298, 'learning_rate': 1.374166961216757e-05}[Rank 3] Trainer log: {'loss': 0.3694, 'grad_norm': 3.491997003555298, 'learning_rate': 1.374166961216757e-05} - -[Rank 1] Trainer log: {'loss': 0.3694, 'grad_norm': 3.491997003555298, 'learning_rate': 1.374166961216757e-05} -[Rank 0] Trainer log: {'loss': 0.3694, 'grad_norm': 3.491997003555298, 'learning_rate': 1.374166961216757e-05} -{'loss': 0.3694, 'grad_norm': 3.491997003555298, 'learning_rate': 1.374166961216757e-05, 'epoch': 0.41} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003797085722908378, 'train/lm_loss': 8.709917892701924e-05, 'train/info_loss': 3.653631210909225e-05, 'train/ref_loss': 0.1358242630958557, 'train/uncertainty_loss': -7.6149997767061e-05, 'train/video_loss': 0.13882231712341309, 'train/total_loss': 0.13890941441059113} -tensor(0.0481, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.036082908511161804, 'train/info_loss': 0.13875597715377808, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001068638637661934, 'train/video_loss': 0.1386491060256958, 'train/total_loss': 0.1747320145368576} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2538, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3415, 'grad_norm': 6.5381693840026855, 'learning_rate': 1.3731774814048619e-05}[Rank 3] Trainer log: {'loss': 0.3415, 'grad_norm': 6.5381693840026855, 'learning_rate': 1.3731774814048619e-05}[Rank 2] Trainer log: {'loss': 0.3415, 'grad_norm': 6.5381693840026855, 'learning_rate': 1.3731774814048619e-05} - - -[Rank 0] Trainer log: {'loss': 0.3415, 'grad_norm': 6.5381693840026855, 'learning_rate': 1.3731774814048619e-05} -{'loss': 0.3415, 'grad_norm': 6.5381693840026855, 'learning_rate': 1.3731774814048619e-05, 'epoch': 0.41} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3234, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0553, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017742788186296822, 'train/lm_loss': 5.189699004404247e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.24254253506660461, 'train/uncertainty_loss': 0.005531790852546692, 'train/video_loss': 0.2495236098766327, 'train/total_loss': 0.24957551062107086} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1830, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00040817777626216416, 'train/lm_loss': 4.555660125333816e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.1984700709581375, 'train/uncertainty_loss': -7.372286054305733e-05, 'train/video_loss': 0.20169205963611603, 'train/total_loss': 0.20173761248588562} -tensor(0.6546, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4129, 'grad_norm': 5.545064926147461, 'learning_rate': 1.3721875769298575e-05}[Rank 0] Trainer log: {'loss': 0.4129, 'grad_norm': 5.545064926147461, 'learning_rate': 1.3721875769298575e-05}[Rank 3] Trainer log: {'loss': 0.4129, 'grad_norm': 5.545064926147461, 'learning_rate': 1.3721875769298575e-05} - - -[Rank 1] Trainer log: {'loss': 0.4129, 'grad_norm': 5.545064926147461, 'learning_rate': 1.3721875769298575e-05} -{'loss': 0.4129, 'grad_norm': 5.545064926147461, 'learning_rate': 1.3721875769298575e-05, 'epoch': 0.41} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1763, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002435807138681412, 'train/lm_loss': 8.676553843542934e-05, 'train/info_loss': 3.862231824314222e-05, 'train/ref_loss': 0.3314680755138397, 'train/uncertainty_loss': 0.017631065845489503, 'train/video_loss': 0.35108640789985657, 'train/total_loss': 0.35117316246032715} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28087902069091797, 'train/info_loss': 0.2160940170288086, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015372912166640164, 'train/video_loss': 0.21594028174877167, 'train/total_loss': 0.49681931734085083} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3684, 'grad_norm': 1.962329387664795, 'learning_rate': 1.3711972489182208e-05}[Rank 3] Trainer log: {'loss': 0.3684, 'grad_norm': 1.962329387664795, 'learning_rate': 1.3711972489182208e-05}[Rank 2] Trainer log: {'loss': 0.3684, 'grad_norm': 1.962329387664795, 'learning_rate': 1.3711972489182208e-05} - -[Rank 1] Trainer log: {'loss': 0.3684, 'grad_norm': 1.962329387664795, 'learning_rate': 1.3711972489182208e-05} - -{'loss': 0.3684, 'grad_norm': 1.962329387664795, 'learning_rate': 1.3711972489182208e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22478392124176028, 'train/info_loss': 0.13718444108963013, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011944992002099752, 'train/video_loss': 0.13706499338150024, 'train/total_loss': 0.361848920583725} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.8219, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3689150810241699, 'train/info_loss': 0.21289490163326263, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012108379742130637, 'train/video_loss': 0.21277381479740143, 'train/total_loss': 0.5816888809204102} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.5195, 'grad_norm': 6.891185283660889, 'learning_rate': 1.3702064984969112e-05}[Rank 0] Trainer log: {'loss': 0.5195, 'grad_norm': 6.891185283660889, 'learning_rate': 1.3702064984969112e-05}[Rank 3] Trainer log: {'loss': 0.5195, 'grad_norm': 6.891185283660889, 'learning_rate': 1.3702064984969112e-05} - -[Rank 1] Trainer log: {'loss': 0.5195, 'grad_norm': 6.891185283660889, 'learning_rate': 1.3702064984969112e-05} - -{'loss': 0.5195, 'grad_norm': 6.891185283660889, 'learning_rate': 1.3702064984969112e-05, 'epoch': 0.41} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17662663459777833, 'train/info_loss': 0.12270689010620117, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010512975277379156, 'train/video_loss': 0.12260176241397858, 'train/total_loss': 0.29922839999198914} -tensor(0.0069, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22996551990509034, 'train/info_loss': 0.2098693996667862, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011715227738022805, 'train/video_loss': 0.20975224673748016, 'train/total_loss': 0.43971776962280273} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0765, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3528, 'grad_norm': 4.348979949951172, 'learning_rate': 1.3692153267933686e-05}[Rank 3] Trainer log: {'loss': 0.3528, 'grad_norm': 4.348979949951172, 'learning_rate': 1.3692153267933686e-05}[Rank 2] Trainer log: {'loss': 0.3528, 'grad_norm': 4.348979949951172, 'learning_rate': 1.3692153267933686e-05} - - -[Rank 0] Trainer log: {'loss': 0.3528, 'grad_norm': 4.348979949951172, 'learning_rate': 1.3692153267933686e-05} -{'loss': 0.3528, 'grad_norm': 4.348979949951172, 'learning_rate': 1.3692153267933686e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20727331638336183, 'train/info_loss': 0.13726918399333954, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011169277131557466, 'train/video_loss': 0.13715748488903046, 'train/total_loss': 0.3444308042526245} -tensor(0.3952, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2572, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003141602734103799, 'train/lm_loss': 0.00011386013356968762, 'train/info_loss': 4.410549081512727e-05, 'train/ref_loss': 0.05289160832762718, 'train/uncertainty_loss': -7.130444864742458e-05, 'train/video_loss': 0.05537768825888634, 'train/total_loss': 0.05549154803156853} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3162, 'grad_norm': 5.43202543258667, 'learning_rate': 1.368223734935512e-05}[Rank 2] Trainer log: {'loss': 0.3162, 'grad_norm': 5.43202543258667, 'learning_rate': 1.368223734935512e-05} - -[Rank 1] Trainer log: {'loss': 0.3162, 'grad_norm': 5.43202543258667, 'learning_rate': 1.368223734935512e-05}[Rank 0] Trainer log: {'loss': 0.3162, 'grad_norm': 5.43202543258667, 'learning_rate': 1.368223734935512e-05} - -{'loss': 0.3162, 'grad_norm': 5.43202543258667, 'learning_rate': 1.368223734935512e-05, 'epoch': 0.41} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2703237771987915, 'train/info_loss': 0.17534175515174866, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010856798617169261, 'train/video_loss': 0.17523318529129028, 'train/total_loss': 0.44555696845054626} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1480, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0773, device='cuda:2', grad_fn=) tensor(0.0086, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002486221492290497, 'train/lm_loss': 9.956265566870571e-05, 'train/info_loss': 3.892031963914633e-05, 'train/ref_loss': 0.0866260826587677, 'train/uncertainty_loss': -7.39737180992961e-05, 'train/video_loss': 0.08858000487089157, 'train/total_loss': 0.08867956697940826} -[Rank 3] Trainer log: {'loss': 0.2409, 'grad_norm': 3.5467851161956787, 'learning_rate': 1.3672317240517388e-05}[Rank 1] Trainer log: {'loss': 0.2409, 'grad_norm': 3.5467851161956787, 'learning_rate': 1.3672317240517388e-05}[Rank 2] Trainer log: {'loss': 0.2409, 'grad_norm': 3.5467851161956787, 'learning_rate': 1.3672317240517388e-05} - - -[Rank 0] Trainer log: {'loss': 0.2409, 'grad_norm': 3.5467851161956787, 'learning_rate': 1.3672317240517388e-05} -{'loss': 0.2409, 'grad_norm': 3.5467851161956787, 'learning_rate': 1.3672317240517388e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019423031480982902, 'train/lm_loss': 6.719890516251326e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.14483189582824707, 'train/uncertainty_loss': -6.675883196294308e-05, 'train/video_loss': 0.146353617310524, 'train/total_loss': 0.14642082154750824} -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0001282757380977273, 'train/info_loss': 3.4360338759142905e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001608126680366695, 'train/video_loss': -0.0001264523307327181, 'train/total_loss': 1.82341318577528e-06} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.4060, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.268, 'grad_norm': 3.3204827308654785, 'learning_rate': 1.366239295270923e-05}[Rank 3] Trainer log: {'loss': 0.268, 'grad_norm': 3.3204827308654785, 'learning_rate': 1.366239295270923e-05} - -[Rank 0] Trainer log: {'loss': 0.268, 'grad_norm': 3.3204827308654785, 'learning_rate': 1.366239295270923e-05}[Rank 2] Trainer log: {'loss': 0.268, 'grad_norm': 3.3204827308654785, 'learning_rate': 1.366239295270923e-05} - -{'loss': 0.268, 'grad_norm': 3.3204827308654785, 'learning_rate': 1.366239295270923e-05, 'epoch': 0.41} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.00010482901707291603, 'train/info_loss': 0.002031029434874654, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010982827516272665, 'train/video_loss': 0.0019212011247873306, 'train/total_loss': 0.002026030095294118} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(0.1137, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0268, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3315824747085572, 'train/info_loss': 0.17569191753864288, 'train/ref_loss': None, 'train/uncertainty_loss': -9.074982372112573e-05, 'train/video_loss': 0.17560116946697235, 'train/total_loss': 0.5071836709976196} -tensor(0.0017, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0036, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3061, 'grad_norm': 9.43321418762207, 'learning_rate': 1.3652464497224146e-05}[Rank 0] Trainer log: {'loss': 0.3061, 'grad_norm': 9.43321418762207, 'learning_rate': 1.3652464497224146e-05} - -[Rank 3] Trainer log: {'loss': 0.3061, 'grad_norm': 9.43321418762207, 'learning_rate': 1.3652464497224146e-05}[Rank 1] Trainer log: {'loss': 0.3061, 'grad_norm': 9.43321418762207, 'learning_rate': 1.3652464497224146e-05} - -{'loss': 0.3061, 'grad_norm': 9.43321418762207, 'learning_rate': 1.3652464497224146e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2728, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0079, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023827801924198867, 'train/lm_loss': 9.837115649133921e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.21338725090026855, 'train/uncertainty_loss': 0.0007925134152173996, 'train/video_loss': 0.21612170338630676, 'train/total_loss': 0.21622008085250854} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24249088764190674, 'train/info_loss': 0.21116921305656433, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012886746553704144, 'train/video_loss': 0.21104034781455994, 'train/total_loss': 0.4535312354564667} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.2153, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4271, 'grad_norm': 6.5379319190979, 'learning_rate': 1.3642531885360375e-05} -[Rank 2] Trainer log: {'loss': 0.4271, 'grad_norm': 6.5379319190979, 'learning_rate': 1.3642531885360375e-05} -[Rank 3] Trainer log: {'loss': 0.4271, 'grad_norm': 6.5379319190979, 'learning_rate': 1.3642531885360375e-05} -[Rank 0] Trainer log: {'loss': 0.4271, 'grad_norm': 6.5379319190979, 'learning_rate': 1.3642531885360375e-05} -{'loss': 0.4271, 'grad_norm': 6.5379319190979, 'learning_rate': 1.3642531885360375e-05, 'epoch': 0.41} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0252, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3196, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1850, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029134401120245457, 'train/lm_loss': 5.86900394409895e-05, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.3356165289878845, 'train/uncertainty_loss': 0.018504352867603303, 'train/video_loss': 0.3564836382865906, 'train/total_loss': 0.35654231905937195} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0507, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028071543201804163, 'train/lm_loss': 0.00027170788962394, 'train/info_loss': 5.113816951052286e-05, 'train/ref_loss': 0.24356471002101898, 'train/uncertainty_loss': 0.005065457522869111, 'train/video_loss': 0.25092703104019165, 'train/total_loss': 0.25119873881340027} -[Rank 3] Trainer log: {'loss': 0.3436, 'grad_norm': 6.201205253601074, 'learning_rate': 1.363259512842089e-05} -[Rank 1] Trainer log: {'loss': 0.3436, 'grad_norm': 6.201205253601074, 'learning_rate': 1.363259512842089e-05} -[Rank 2] Trainer log: {'loss': 0.3436, 'grad_norm': 6.201205253601074, 'learning_rate': 1.363259512842089e-05} -[Rank 0] Trainer log: {'loss': 0.3436, 'grad_norm': 6.201205253601074, 'learning_rate': 1.363259512842089e-05} -{'loss': 0.3436, 'grad_norm': 6.201205253601074, 'learning_rate': 1.363259512842089e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3039451837539673, 'train/info_loss': 0.13627225160598755, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001208203728310764, 'train/video_loss': 0.13615143299102783, 'train/total_loss': 0.4400966167449951} -tensor(0.1048, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0283, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32285962104797367, 'train/info_loss': 0.19232486188411713, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013009578688070179, 'train/video_loss': 0.19219475984573364, 'train/total_loss': 0.5150543451309204} -[Rank 0] Trainer log: {'loss': 0.3903, 'grad_norm': 5.607570171356201, 'learning_rate': 1.3622654237713371e-05}[Rank 2] Trainer log: {'loss': 0.3903, 'grad_norm': 5.607570171356201, 'learning_rate': 1.3622654237713371e-05} -[Rank 1] Trainer log: {'loss': 0.3903, 'grad_norm': 5.607570171356201, 'learning_rate': 1.3622654237713371e-05} -[Rank 3] Trainer log: {'loss': 0.3903, 'grad_norm': 5.607570171356201, 'learning_rate': 1.3622654237713371e-05} - -{'loss': 0.3903, 'grad_norm': 5.607570171356201, 'learning_rate': 1.3622654237713371e-05, 'epoch': 0.41} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0410, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003584705526009202, 'train/lm_loss': 7.72565370425582e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.2472728192806244, 'train/uncertainty_loss': 0.004101280868053436, 'train/video_loss': 0.25427672266960144, 'train/total_loss': 0.25435397028923035} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38953599929809574, 'train/info_loss': 0.20973514020442963, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012383598368614912, 'train/video_loss': 0.20961131155490875, 'train/total_loss': 0.5991473197937012} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2907, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4081, 'grad_norm': 7.58552885055542, 'learning_rate': 1.3612709224550219e-05}[Rank 3] Trainer log: {'loss': 0.4081, 'grad_norm': 7.58552885055542, 'learning_rate': 1.3612709224550219e-05} - -[Rank 0] Trainer log: {'loss': 0.4081, 'grad_norm': 7.58552885055542, 'learning_rate': 1.3612709224550219e-05}[Rank 1] Trainer log: {'loss': 0.4081, 'grad_norm': 7.58552885055542, 'learning_rate': 1.3612709224550219e-05} - -{'loss': 0.4081, 'grad_norm': 7.58552885055542, 'learning_rate': 1.3612709224550219e-05, 'epoch': 0.41} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23756034374237062, 'train/info_loss': 0.11810138821601868, 'train/ref_loss': None, 'train/uncertainty_loss': -9.937991853803396e-05, 'train/video_loss': 0.11800200492143631, 'train/total_loss': 0.35556235909461975} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3745, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0513, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002572661265730858, 'train/lm_loss': 5.2326032891869546e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.2546209692955017, 'train/uncertainty_loss': 0.005129205808043481, 'train/video_loss': 0.2618390619754791, 'train/total_loss': 0.2618913948535919} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3711, 'grad_norm': 2.8522140979766846, 'learning_rate': 1.3602760100248504e-05} -[Rank 0] Trainer log: {'loss': 0.3711, 'grad_norm': 2.8522140979766846, 'learning_rate': 1.3602760100248504e-05}[Rank 3] Trainer log: {'loss': 0.3711, 'grad_norm': 2.8522140979766846, 'learning_rate': 1.3602760100248504e-05} - -[Rank 1] Trainer log: {'loss': 0.3711, 'grad_norm': 2.8522140979766846, 'learning_rate': 1.3602760100248504e-05} -{'loss': 0.3711, 'grad_norm': 2.8522140979766846, 'learning_rate': 1.3602760100248504e-05, 'epoch': 0.41} -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23154048919677736, 'train/info_loss': 0.22454117238521576, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015564358327537777, 'train/video_loss': 0.22438552975654602, 'train/total_loss': 0.45592600107192993} -tensor(0.1293, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0215, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14597820043563844, 'train/info_loss': 0.1936994343996048, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010632614139467479, 'train/video_loss': 0.1935931146144867, 'train/total_loss': 0.3395712971687317} -tensor(0.1422, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2861, 'grad_norm': 4.326483726501465, 'learning_rate': 1.3592806876129995e-05}[Rank 1] Trainer log: {'loss': 0.2861, 'grad_norm': 4.326483726501465, 'learning_rate': 1.3592806876129995e-05} -[Rank 3] Trainer log: {'loss': 0.2861, 'grad_norm': 4.326483726501465, 'learning_rate': 1.3592806876129995e-05} -[Rank 2] Trainer log: {'loss': 0.2861, 'grad_norm': 4.326483726501465, 'learning_rate': 1.3592806876129995e-05} - -{'loss': 0.2861, 'grad_norm': 4.326483726501465, 'learning_rate': 1.3592806876129995e-05, 'epoch': 0.41} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1671, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003069448750466109, 'train/lm_loss': 8.728983229957522e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.3242368996143341, 'train/uncertainty_loss': 0.01670655608177185, 'train/video_loss': 0.34343501925468445, 'train/total_loss': 0.343522310256958} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.357656192779541, 'train/info_loss': 0.17637208104133606, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011097616516053677, 'train/video_loss': 0.17626111209392548, 'train/total_loss': 0.5339173078536987} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1262, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3289, 'grad_norm': 1.4358038902282715, 'learning_rate': 1.3582849563521117e-05}[Rank 3] Trainer log: {'loss': 0.3289, 'grad_norm': 1.4358038902282715, 'learning_rate': 1.3582849563521117e-05} -[Rank 0] Trainer log: {'loss': 0.3289, 'grad_norm': 1.4358038902282715, 'learning_rate': 1.3582849563521117e-05} - -[Rank 2] Trainer log: {'loss': 0.3289, 'grad_norm': 1.4358038902282715, 'learning_rate': 1.3582849563521117e-05} -{'loss': 0.3289, 'grad_norm': 1.4358038902282715, 'learning_rate': 1.3582849563521117e-05, 'epoch': 0.41} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3188606023788452, 'train/info_loss': 0.20881704986095428, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001443229615688324, 'train/video_loss': 0.20867273211479187, 'train/total_loss': 0.5275333523750305} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.4713, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0122, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019954333547502758, 'train/lm_loss': 6.653156015090644e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.23222532868385315, 'train/uncertainty_loss': 0.0012245378457009792, 'train/video_loss': 0.23507946729660034, 'train/total_loss': 0.23514600098133087} -[Rank 0] Trainer log: {'loss': 0.404, 'grad_norm': 1.6297718286514282, 'learning_rate': 1.3572888173752947e-05}[Rank 1] Trainer log: {'loss': 0.404, 'grad_norm': 1.6297718286514282, 'learning_rate': 1.3572888173752947e-05}[Rank 3] Trainer log: {'loss': 0.404, 'grad_norm': 1.6297718286514282, 'learning_rate': 1.3572888173752947e-05} - - -[Rank 2] Trainer log: {'loss': 0.404, 'grad_norm': 1.6297718286514282, 'learning_rate': 1.3572888173752947e-05} -{'loss': 0.404, 'grad_norm': 1.6297718286514282, 'learning_rate': 1.3572888173752947e-05, 'epoch': 0.41} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1305, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002684651408344507, 'train/lm_loss': 5.935741937719286e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.30310922861099243, 'train/uncertainty_loss': 0.013054361939430237, 'train/video_loss': 0.3183451294898987, 'train/total_loss': 0.31840449571609497} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1591, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0018, device='cuda:2', grad_fn=) tensor(-0.0018, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09436299800872804, 'train/info_loss': 0.21262620389461517, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014426109846681358, 'train/video_loss': 0.21248194575309753, 'train/total_loss': 0.30684494972229004} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3702, 'grad_norm': 5.029908657073975, 'learning_rate': 1.35629227181612e-05} -[Rank 3] Trainer log: {'loss': 0.3702, 'grad_norm': 5.029908657073975, 'learning_rate': 1.35629227181612e-05} -[Rank 0] Trainer log: {'loss': 0.3702, 'grad_norm': 5.029908657073975, 'learning_rate': 1.35629227181612e-05} -[Rank 1] Trainer log: {'loss': 0.3702, 'grad_norm': 5.029908657073975, 'learning_rate': 1.35629227181612e-05} -{'loss': 0.3702, 'grad_norm': 5.029908657073975, 'learning_rate': 1.35629227181612e-05, 'epoch': 0.41} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2700037479400635, 'train/info_loss': 0.23339509963989258, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014063093112781644, 'train/video_loss': 0.23325446248054504, 'train/total_loss': 0.503258228302002} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35344221591949465, 'train/info_loss': 0.1544828563928604, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012813413050025702, 'train/video_loss': 0.15435472130775452, 'train/total_loss': 0.5077969431877136} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1347, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4556, 'grad_norm': 5.258204936981201, 'learning_rate': 1.3552953208086227e-05} -[Rank 2] Trainer log: {'loss': 0.4556, 'grad_norm': 5.258204936981201, 'learning_rate': 1.3552953208086227e-05}[Rank 1] Trainer log: {'loss': 0.4556, 'grad_norm': 5.258204936981201, 'learning_rate': 1.3552953208086227e-05} - -[Rank 0] Trainer log: {'loss': 0.4556, 'grad_norm': 5.258204936981201, 'learning_rate': 1.3552953208086227e-05} -{'loss': 0.4556, 'grad_norm': 5.258204936981201, 'learning_rate': 1.3552953208086227e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20875368118286133, 'train/info_loss': 0.14046989381313324, 'train/ref_loss': None, 'train/uncertainty_loss': -9.519070154055953e-05, 'train/video_loss': 0.14037470519542694, 'train/total_loss': 0.349128395318985} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002200167626142502, 'train/lm_loss': 6.741341203451157e-05, 'train/info_loss': 3.5403903893893585e-05, 'train/ref_loss': 0.17710845172405243, 'train/uncertainty_loss': -7.292901864275336e-05, 'train/video_loss': 0.1788310706615448, 'train/total_loss': 0.17889848351478577} -tensor(0.1482, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3677, 'grad_norm': 2.3441290855407715, 'learning_rate': 1.3542979654872986e-05}[Rank 1] Trainer log: {'loss': 0.3677, 'grad_norm': 2.3441290855407715, 'learning_rate': 1.3542979654872986e-05} - -[Rank 3] Trainer log: {'loss': 0.3677, 'grad_norm': 2.3441290855407715, 'learning_rate': 1.3542979654872986e-05} -[Rank 2] Trainer log: {'loss': 0.3677, 'grad_norm': 2.3441290855407715, 'learning_rate': 1.3542979654872986e-05} -{'loss': 0.3677, 'grad_norm': 2.3441290855407715, 'learning_rate': 1.3542979654872986e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.5125, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007973357103765011, 'train/lm_loss': 5.184931796975434e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.5507732629776001, 'train/uncertainty_loss': 0.05124592185020447, 'train/video_loss': 0.6084277629852295, 'train/total_loss': 0.6084796190261841} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.6501, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002103532198816538, 'train/lm_loss': 8.733749273233117e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.6449301242828369, 'train/uncertainty_loss': 0.06501425504684448, 'train/video_loss': 0.7116631865501404, 'train/total_loss': 0.7117505073547363} -[Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 8.33923053741455, 'learning_rate': 1.353300206987103e-05} -[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 8.33923053741455, 'learning_rate': 1.353300206987103e-05}[Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 8.33923053741455, 'learning_rate': 1.353300206987103e-05} -[Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 8.33923053741455, 'learning_rate': 1.353300206987103e-05} - -{'loss': 0.3907, 'grad_norm': 8.33923053741455, 'learning_rate': 1.353300206987103e-05, 'epoch': 0.42} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05905197858810425, 'train/info_loss': 0.20950308442115784, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010539073264226317, 'train/video_loss': 0.20939768850803375, 'train/total_loss': 0.26844966411590576} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1349349856376648, 'train/info_loss': 0.18196092545986176, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010958928614854813, 'train/video_loss': 0.1818513423204422, 'train/total_loss': 0.31678634881973267} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5652, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4301, 'grad_norm': 6.707249164581299, 'learning_rate': 1.3523020464434515e-05}[Rank 0] Trainer log: {'loss': 0.4301, 'grad_norm': 6.707249164581299, 'learning_rate': 1.3523020464434515e-05} - -[Rank 3] Trainer log: {'loss': 0.4301, 'grad_norm': 6.707249164581299, 'learning_rate': 1.3523020464434515e-05} -[Rank 2] Trainer log: {'loss': 0.4301, 'grad_norm': 6.707249164581299, 'learning_rate': 1.3523020464434515e-05} -{'loss': 0.4301, 'grad_norm': 6.707249164581299, 'learning_rate': 1.3523020464434515e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36314752101898196, 'train/info_loss': 0.2554057836532593, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013006750959903003, 'train/video_loss': 0.2552757263183594, 'train/total_loss': 0.6184232234954834} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2579, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000189405947457999, 'train/lm_loss': 9.396252571605147e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.14956897497177124, 'train/uncertainty_loss': -6.98487099725753e-05, 'train/video_loss': 0.15105271339416504, 'train/total_loss': 0.15114668011665344} -[Rank 0] Trainer log: {'loss': 0.2804, 'grad_norm': 3.9106268882751465, 'learning_rate': 1.351303484992216e-05}[Rank 3] Trainer log: {'loss': 0.2804, 'grad_norm': 3.9106268882751465, 'learning_rate': 1.351303484992216e-05}[Rank 2] Trainer log: {'loss': 0.2804, 'grad_norm': 3.9106268882751465, 'learning_rate': 1.351303484992216e-05} - - -[Rank 1] Trainer log: {'loss': 0.2804, 'grad_norm': 3.9106268882751465, 'learning_rate': 1.351303484992216e-05} -{'loss': 0.2804, 'grad_norm': 3.9106268882751465, 'learning_rate': 1.351303484992216e-05, 'epoch': 0.42} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004741732031106949, 'train/lm_loss': 5.153945530764759e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.15573173761367798, 'train/uncertainty_loss': -7.469147094525398e-05, 'train/video_loss': 0.1594811975955963, 'train/total_loss': 0.15953274071216583} -tensor(0.0357, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021648367401212455, 'train/lm_loss': 8.678936865180731e-05, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.16508106887340546, 'train/uncertainty_loss': -7.439341861754657e-05, 'train/video_loss': 0.16677264869213104, 'train/total_loss': 0.166859433054924} -tensor(0.0894, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3385, 'grad_norm': 5.148198127746582, 'learning_rate': 1.350304523769725e-05} -[Rank 3] Trainer log: {'loss': 0.3385, 'grad_norm': 5.148198127746582, 'learning_rate': 1.350304523769725e-05}[Rank 2] Trainer log: {'loss': 0.3385, 'grad_norm': 5.148198127746582, 'learning_rate': 1.350304523769725e-05} - -[Rank 0] Trainer log: {'loss': 0.3385, 'grad_norm': 5.148198127746582, 'learning_rate': 1.350304523769725e-05} -{'loss': 0.3385, 'grad_norm': 5.148198127746582, 'learning_rate': 1.350304523769725e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1477462649345398, 'train/info_loss': 0.17341957986354828, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010601731482893229, 'train/video_loss': 0.17331355810165405, 'train/total_loss': 0.32105982303619385} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30745532512664797, 'train/info_loss': 0.1623249500989914, 'train/ref_loss': None, 'train/uncertainty_loss': -8.923322893679143e-05, 'train/video_loss': 0.16223572194576263, 'train/total_loss': 0.46969103813171387} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4507, 'grad_norm': 2.8151228427886963, 'learning_rate': 1.3493051639127618e-05}[Rank 0] Trainer log: {'loss': 0.4507, 'grad_norm': 2.8151228427886963, 'learning_rate': 1.3493051639127618e-05}[Rank 3] Trainer log: {'loss': 0.4507, 'grad_norm': 2.8151228427886963, 'learning_rate': 1.3493051639127618e-05} - -[Rank 2] Trainer log: {'loss': 0.4507, 'grad_norm': 2.8151228427886963, 'learning_rate': 1.3493051639127618e-05} - -{'loss': 0.4507, 'grad_norm': 2.8151228427886963, 'learning_rate': 1.3493051639127618e-05, 'epoch': 0.42} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2223534107208252, 'train/info_loss': 0.12966793775558472, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014888261212036015, 'train/video_loss': 0.12951906025409698, 'train/total_loss': 0.3518724739551544} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.3801, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032034057658165694, 'train/lm_loss': 5.168247153051198e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.46160727739334106, 'train/uncertainty_loss': 0.03801331222057343, 'train/video_loss': 0.5022146105766296, 'train/total_loss': 0.5022662878036499} -[Rank 1] Trainer log: {'loss': 0.3825, 'grad_norm': 7.100663661956787, 'learning_rate': 1.348305406558564e-05}[Rank 3] Trainer log: {'loss': 0.3825, 'grad_norm': 7.100663661956787, 'learning_rate': 1.348305406558564e-05} - -[Rank 0] Trainer log: {'loss': 0.3825, 'grad_norm': 7.100663661956787, 'learning_rate': 1.348305406558564e-05}[Rank 2] Trainer log: {'loss': 0.3825, 'grad_norm': 7.100663661956787, 'learning_rate': 1.348305406558564e-05} - -{'loss': 0.3825, 'grad_norm': 7.100663661956787, 'learning_rate': 1.348305406558564e-05, 'epoch': 0.42} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005825265310704708, 'train/lm_loss': 6.641238578595222e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.13792574405670166, 'train/uncertainty_loss': -7.565768319182098e-05, 'train/video_loss': 0.14254255592823029, 'train/total_loss': 0.14260897040367126} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22241013050079347, 'train/info_loss': 0.14607909321784973, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012074292171746493, 'train/video_loss': 0.14595834910869598, 'train/total_loss': 0.3683684766292572} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3396, 'grad_norm': 2.9473025798797607, 'learning_rate': 1.3473052528448203e-05}[Rank 0] Trainer log: {'loss': 0.3396, 'grad_norm': 2.9473025798797607, 'learning_rate': 1.3473052528448203e-05}[Rank 2] Trainer log: {'loss': 0.3396, 'grad_norm': 2.9473025798797607, 'learning_rate': 1.3473052528448203e-05} - - -[Rank 1] Trainer log: {'loss': 0.3396, 'grad_norm': 2.9473025798797607, 'learning_rate': 1.3473052528448203e-05} -{'loss': 0.3396, 'grad_norm': 2.9473025798797607, 'learning_rate': 1.3473052528448203e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.6044, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002832774538546801, 'train/lm_loss': 6.657922640442849e-05, 'train/info_loss': 3.301988181192428e-05, 'train/ref_loss': 0.6344544887542725, 'train/uncertainty_loss': 0.06044383049011231, 'train/video_loss': 0.6971975564956665, 'train/total_loss': 0.6972641348838806} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0017, device='cuda:1', grad_fn=) tensor(-0.0017, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.4785, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1792296051979065, 'train/info_loss': 0.2012353539466858, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010555308545008303, 'train/video_loss': 0.20112979412078857, 'train/total_loss': 0.380359411239624} -[Rank 1] Trainer log: {'loss': 0.5022, 'grad_norm': 10.875261306762695, 'learning_rate': 1.346304703909672e-05}[Rank 0] Trainer log: {'loss': 0.5022, 'grad_norm': 10.875261306762695, 'learning_rate': 1.346304703909672e-05}[Rank 3] Trainer log: {'loss': 0.5022, 'grad_norm': 10.875261306762695, 'learning_rate': 1.346304703909672e-05} - - -[Rank 2] Trainer log: {'loss': 0.5022, 'grad_norm': 10.875261306762695, 'learning_rate': 1.346304703909672e-05} -{'loss': 0.5022, 'grad_norm': 10.875261306762695, 'learning_rate': 1.346304703909672e-05, 'epoch': 0.42} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003396853106096387, 'train/lm_loss': 0.0001130499760620296, 'train/info_loss': 4.207910751574673e-05, 'train/ref_loss': 0.07893238216638565, 'train/uncertainty_loss': -6.759181269444525e-05, 'train/video_loss': 0.0816243514418602, 'train/total_loss': 0.08173739910125732} -tensor(0.0425, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0017, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1868248224258423, 'train/info_loss': 0.36165156960487366, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001657962566241622, 'train/video_loss': 0.3614857792854309, 'train/total_loss': 0.5483106374740601} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.4234, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3347, 'grad_norm': 10.859387397766113, 'learning_rate': 1.3453037608917084e-05} -[Rank 3] Trainer log: {'loss': 0.3347, 'grad_norm': 10.859387397766113, 'learning_rate': 1.3453037608917084e-05}[Rank 1] Trainer log: {'loss': 0.3347, 'grad_norm': 10.859387397766113, 'learning_rate': 1.3453037608917084e-05} - -[Rank 0] Trainer log: {'loss': 0.3347, 'grad_norm': 10.859387397766113, 'learning_rate': 1.3453037608917084e-05} -{'loss': 0.3347, 'grad_norm': 10.859387397766113, 'learning_rate': 1.3453037608917084e-05, 'epoch': 0.42} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2329319715499878, 'train/info_loss': 0.18813802301883698, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012042445596307517, 'train/video_loss': 0.1880175918340683, 'train/total_loss': 0.4209495782852173} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1293, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2608559846878052, 'train/info_loss': 0.19510501623153687, 'train/ref_loss': None, 'train/uncertainty_loss': -8.919166284613312e-05, 'train/video_loss': 0.1950158178806305, 'train/total_loss': 0.4558718204498291} -[Rank 1] Trainer log: {'loss': 0.3668, 'grad_norm': 4.175601005554199, 'learning_rate': 1.3443024249299686e-05}[Rank 2] Trainer log: {'loss': 0.3668, 'grad_norm': 4.175601005554199, 'learning_rate': 1.3443024249299686e-05}[Rank 3] Trainer log: {'loss': 0.3668, 'grad_norm': 4.175601005554199, 'learning_rate': 1.3443024249299686e-05} - - -[Rank 0] Trainer log: {'loss': 0.3668, 'grad_norm': 4.175601005554199, 'learning_rate': 1.3443024249299686e-05} -{'loss': 0.3668, 'grad_norm': 4.175601005554199, 'learning_rate': 1.3443024249299686e-05, 'epoch': 0.42} -tensor(-0.0016, device='cuda:3', grad_fn=) tensor(-0.0016, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005004546139389277, 'train/lm_loss': 5.9047562535852195e-05, 'train/info_loss': 3.1529863917967305e-05, 'train/ref_loss': 0.19279973208904266, 'train/uncertainty_loss': -7.293441449292004e-05, 'train/video_loss': 0.19676196575164795, 'train/total_loss': 0.19682101905345917} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3515474081039429, 'train/info_loss': 0.10684160888195038, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010618717642500997, 'train/video_loss': 0.10673542320728302, 'train/total_loss': 0.45828282833099365} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2715, 'grad_norm': 5.740907669067383, 'learning_rate': 1.3433006971639381e-05} -[Rank 0] Trainer log: {'loss': 0.2715, 'grad_norm': 5.740907669067383, 'learning_rate': 1.3433006971639381e-05}[Rank 3] Trainer log: {'loss': 0.2715, 'grad_norm': 5.740907669067383, 'learning_rate': 1.3433006971639381e-05}[Rank 2] Trainer log: {'loss': 0.2715, 'grad_norm': 5.740907669067383, 'learning_rate': 1.3433006971639381e-05} - - -{'loss': 0.2715, 'grad_norm': 5.740907669067383, 'learning_rate': 1.3433006971639381e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19905582666397095, 'train/info_loss': 0.17115741968154907, 'train/ref_loss': None, 'train/uncertainty_loss': -9.00962040759623e-05, 'train/video_loss': 0.17106732726097107, 'train/total_loss': 0.37012314796447754} -tensor(0.1268, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000235192128457129, 'train/lm_loss': 9.405785240232945e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.1489553153514862, 'train/uncertainty_loss': -7.097966736182571e-05, 'train/video_loss': 0.15080420672893524, 'train/total_loss': 0.1508982628583908} -[Rank 3] Trainer log: {'loss': 0.4031, 'grad_norm': 3.6784679889678955, 'learning_rate': 1.3422985787335494e-05}[Rank 0] Trainer log: {'loss': 0.4031, 'grad_norm': 3.6784679889678955, 'learning_rate': 1.3422985787335494e-05}[Rank 2] Trainer log: {'loss': 0.4031, 'grad_norm': 3.6784679889678955, 'learning_rate': 1.3422985787335494e-05} - -[Rank 1] Trainer log: {'loss': 0.4031, 'grad_norm': 3.6784679889678955, 'learning_rate': 1.3422985787335494e-05} - -{'loss': 0.4031, 'grad_norm': 3.6784679889678955, 'learning_rate': 1.3422985787335494e-05, 'epoch': 0.42} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2565, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07852090597152711, 'train/info_loss': 0.2339148074388504, 'train/ref_loss': None, 'train/uncertainty_loss': -8.848586003296077e-05, 'train/video_loss': 0.23382632434368134, 'train/total_loss': 0.3123472332954407} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.2578, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0293, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025268017780035736, 'train/lm_loss': 7.644622819498182e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.23571330308914185, 'train/uncertainty_loss': 0.002934698574244976, 'train/video_loss': 0.24070461094379425, 'train/total_loss': 0.2407810539007187} -tensor(0.1537, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.365, 'grad_norm': 2.8188469409942627, 'learning_rate': 1.3412960707791778e-05} -[Rank 3] Trainer log: {'loss': 0.365, 'grad_norm': 2.8188469409942627, 'learning_rate': 1.3412960707791778e-05} -[Rank 0] Trainer log: {'loss': 0.365, 'grad_norm': 2.8188469409942627, 'learning_rate': 1.3412960707791778e-05}[Rank 2] Trainer log: {'loss': 0.365, 'grad_norm': 2.8188469409942627, 'learning_rate': 1.3412960707791778e-05} - -{'loss': 0.365, 'grad_norm': 2.8188469409942627, 'learning_rate': 1.3412960707791778e-05, 'epoch': 0.42} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13088895082473756, 'train/info_loss': 0.2666266858577728, 'train/ref_loss': None, 'train/uncertainty_loss': -8.582388982176782e-05, 'train/video_loss': 0.26654085516929626, 'train/total_loss': 0.39742982387542725} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4454215526580811, 'train/info_loss': 0.19472160935401917, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010907208779826761, 'train/video_loss': 0.1946125328540802, 'train/total_loss': 0.6400340795516968} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0670, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4434, 'grad_norm': 7.417932033538818, 'learning_rate': 1.3402931744416432e-05}[Rank 3] Trainer log: {'loss': 0.4434, 'grad_norm': 7.417932033538818, 'learning_rate': 1.3402931744416432e-05}[Rank 2] Trainer log: {'loss': 0.4434, 'grad_norm': 7.417932033538818, 'learning_rate': 1.3402931744416432e-05} - - -[Rank 1] Trainer log: {'loss': 0.4434, 'grad_norm': 7.417932033538818, 'learning_rate': 1.3402931744416432e-05} -{'loss': 0.4434, 'grad_norm': 7.417932033538818, 'learning_rate': 1.3402931744416432e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1391, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004012395162135363, 'train/lm_loss': 6.667455891147256e-05, 'train/info_loss': 3.3556287235114723e-05, 'train/ref_loss': 0.29147759079933167, 'train/uncertainty_loss': 0.013914498686790467, 'train/video_loss': 0.30863556265830994, 'train/total_loss': 0.3087022304534912} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0227, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023050713352859022, 'train/lm_loss': 5.964343436062336e-05, 'train/info_loss': 3.301988181192428e-05, 'train/ref_loss': 0.20996779203414917, 'train/uncertainty_loss': 0.0022653291001915933, 'train/video_loss': 0.21411021053791046, 'train/total_loss': 0.21416985988616943} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3344, 'grad_norm': 3.9118106365203857, 'learning_rate': 1.339289890862207e-05}[Rank 3] Trainer log: {'loss': 0.3344, 'grad_norm': 3.9118106365203857, 'learning_rate': 1.339289890862207e-05} -[Rank 0] Trainer log: {'loss': 0.3344, 'grad_norm': 3.9118106365203857, 'learning_rate': 1.339289890862207e-05} - -[Rank 2] Trainer log: {'loss': 0.3344, 'grad_norm': 3.9118106365203857, 'learning_rate': 1.339289890862207e-05} -{'loss': 0.3344, 'grad_norm': 3.9118106365203857, 'learning_rate': 1.339289890862207e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0341, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002058698330074549, 'train/lm_loss': 6.719890516251326e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.21512913703918457, 'train/uncertainty_loss': 0.0034060955047607424, 'train/video_loss': 0.22021599113941193, 'train/total_loss': 0.22028319537639618} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25285255908966064, 'train/info_loss': 0.2617396414279938, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012028448982164264, 'train/video_loss': 0.26161935925483704, 'train/total_loss': 0.5144718885421753} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4065, 'grad_norm': 3.5708603858947754, 'learning_rate': 1.3382862211825716e-05} -[Rank 2] Trainer log: {'loss': 0.4065, 'grad_norm': 3.5708603858947754, 'learning_rate': 1.3382862211825716e-05} -[Rank 0] Trainer log: {'loss': 0.4065, 'grad_norm': 3.5708603858947754, 'learning_rate': 1.3382862211825716e-05} -[Rank 3] Trainer log: {'loss': 0.4065, 'grad_norm': 3.5708603858947754, 'learning_rate': 1.3382862211825716e-05} -{'loss': 0.4065, 'grad_norm': 3.5708603858947754, 'learning_rate': 1.3382862211825716e-05, 'epoch': 0.42} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0444, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030170292593538765, 'train/lm_loss': 0.00027082695160061124, 'train/info_loss': 4.994619666831568e-05, 'train/ref_loss': 0.2355566769838333, 'train/uncertainty_loss': 0.0044351015239953995, 'train/video_loss': 0.2424553632736206, 'train/total_loss': 0.2427261918783188} -tensor(0.0078, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37128415107727053, 'train/info_loss': 0.2480892837047577, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001099016284570098, 'train/video_loss': 0.24797938764095306, 'train/total_loss': 0.6192635297775269} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.6531, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4059, 'grad_norm': 4.6986589431762695, 'learning_rate': 1.3372821665448776e-05} -[Rank 0] Trainer log: {'loss': 0.4059, 'grad_norm': 4.6986589431762695, 'learning_rate': 1.3372821665448776e-05}[Rank 3] Trainer log: {'loss': 0.4059, 'grad_norm': 4.6986589431762695, 'learning_rate': 1.3372821665448776e-05} - -[Rank 2] Trainer log: {'loss': 0.4059, 'grad_norm': 4.6986589431762695, 'learning_rate': 1.3372821665448776e-05} -{'loss': 0.4059, 'grad_norm': 4.6986589431762695, 'learning_rate': 1.3372821665448776e-05, 'epoch': 0.42} -tensor(0.1145, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0510, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003160102525725961, 'train/lm_loss': 5.215918063186109e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.2381540685892105, 'train/uncertainty_loss': 0.005102389678359032, 'train/video_loss': 0.24581369757652283, 'train/total_loss': 0.2458658516407013} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22455847263336182, 'train/info_loss': 0.18283474445343018, 'train/ref_loss': None, 'train/uncertainty_loss': -9.4089366029948e-05, 'train/video_loss': 0.18274065852165222, 'train/total_loss': 0.40729913115501404} -tensor(0.3753, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3488, 'grad_norm': 10.98208999633789, 'learning_rate': 1.3362777280917056e-05}[Rank 0] Trainer log: {'loss': 0.3488, 'grad_norm': 10.98208999633789, 'learning_rate': 1.3362777280917056e-05} -[Rank 3] Trainer log: {'loss': 0.3488, 'grad_norm': 10.98208999633789, 'learning_rate': 1.3362777280917056e-05} - -[Rank 2] Trainer log: {'loss': 0.3488, 'grad_norm': 10.98208999633789, 'learning_rate': 1.3362777280917056e-05} -{'loss': 0.3488, 'grad_norm': 10.98208999633789, 'learning_rate': 1.3362777280917056e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13508437871932985, 'train/info_loss': 0.12267506867647171, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010725605534389615, 'train/video_loss': 0.1225678101181984, 'train/total_loss': 0.2576521933078766} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30923750400543215, 'train/info_loss': 0.19450122117996216, 'train/ref_loss': None, 'train/uncertainty_loss': -9.988537058234215e-05, 'train/video_loss': 0.1944013386964798, 'train/total_loss': 0.5036388635635376} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3789, 'grad_norm': 4.151271343231201, 'learning_rate': 1.3352729069660714e-05}[Rank 1] Trainer log: {'loss': 0.3789, 'grad_norm': 4.151271343231201, 'learning_rate': 1.3352729069660714e-05}[Rank 0] Trainer log: {'loss': 0.3789, 'grad_norm': 4.151271343231201, 'learning_rate': 1.3352729069660714e-05} - -[Rank 2] Trainer log: {'loss': 0.3789, 'grad_norm': 4.151271343231201, 'learning_rate': 1.3352729069660714e-05} - -{'loss': 0.3789, 'grad_norm': 4.151271343231201, 'learning_rate': 1.3352729069660714e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4107358455657959, 'train/info_loss': 0.22223758697509766, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012797294184565544, 'train/video_loss': 0.2221096158027649, 'train/total_loss': 0.6328454613685608} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2642, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16804002523422243, 'train/info_loss': 0.22661662101745605, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011903118574991823, 'train/video_loss': 0.2264975905418396, 'train/total_loss': 0.39453762769699097} -[Rank 0] Trainer log: {'loss': 0.3836, 'grad_norm': 1.8717761039733887, 'learning_rate': 1.3342677043114272e-05}[Rank 3] Trainer log: {'loss': 0.3836, 'grad_norm': 1.8717761039733887, 'learning_rate': 1.3342677043114272e-05}[Rank 1] Trainer log: {'loss': 0.3836, 'grad_norm': 1.8717761039733887, 'learning_rate': 1.3342677043114272e-05} - - -[Rank 2] Trainer log: {'loss': 0.3836, 'grad_norm': 1.8717761039733887, 'learning_rate': 1.3342677043114272e-05} -{'loss': 0.3836, 'grad_norm': 1.8717761039733887, 'learning_rate': 1.3342677043114272e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003618479706346989, 'train/lm_loss': 6.691290182061494e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.18259692192077637, 'train/uncertainty_loss': -6.922135944478215e-05, 'train/video_loss': 0.18545474112033844, 'train/total_loss': 0.18552164733409882} -tensor(0.0226, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005251416936516762, 'train/lm_loss': 5.1706307567656046e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.14364278316497803, 'train/uncertainty_loss': -7.635020301677287e-05, 'train/video_loss': 0.14779861271381378, 'train/total_loss': 0.14785031974315643} -tensor(0.1358, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2434, 'grad_norm': 6.557397365570068, 'learning_rate': 1.3332621212716583e-05}[Rank 1] Trainer log: {'loss': 0.2434, 'grad_norm': 6.557397365570068, 'learning_rate': 1.3332621212716583e-05}[Rank 0] Trainer log: {'loss': 0.2434, 'grad_norm': 6.557397365570068, 'learning_rate': 1.3332621212716583e-05} - - -[Rank 2] Trainer log: {'loss': 0.2434, 'grad_norm': 6.557397365570068, 'learning_rate': 1.3332621212716583e-05} -{'loss': 0.2434, 'grad_norm': 6.557397365570068, 'learning_rate': 1.3332621212716583e-05, 'epoch': 0.42} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1482739567756653, 'train/info_loss': 0.0984608381986618, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010764412581920624, 'train/video_loss': 0.09835319221019745, 'train/total_loss': 0.24662715196609497} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0641, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22888526916503907, 'train/info_loss': 0.13138125836849213, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011026994325220586, 'train/video_loss': 0.13127098977565765, 'train/total_loss': 0.36015626788139343} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3156, 'grad_norm': 9.982169151306152, 'learning_rate': 1.3322561589910841e-05}[Rank 2] Trainer log: {'loss': 0.3156, 'grad_norm': 9.982169151306152, 'learning_rate': 1.3322561589910841e-05}[Rank 3] Trainer log: {'loss': 0.3156, 'grad_norm': 9.982169151306152, 'learning_rate': 1.3322561589910841e-05} - - -[Rank 1] Trainer log: {'loss': 0.3156, 'grad_norm': 9.982169151306152, 'learning_rate': 1.3322561589910841e-05} -{'loss': 0.3156, 'grad_norm': 9.982169151306152, 'learning_rate': 1.3322561589910841e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.48829398155212406, 'train/info_loss': 0.20160317420959473, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012847039615735412, 'train/video_loss': 0.20147469639778137, 'train/total_loss': 0.6897686719894409} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00041126618161797525, 'train/lm_loss': 7.663688738830389e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.12739607691764832, 'train/uncertainty_loss': -7.429271354340017e-05, 'train/video_loss': 0.13064654171466827, 'train/total_loss': 0.13072317838668823} -tensor(0.3618, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.456, 'grad_norm': 4.200094223022461, 'learning_rate': 1.3312498186144546e-05}[Rank 1] Trainer log: {'loss': 0.456, 'grad_norm': 4.200094223022461, 'learning_rate': 1.3312498186144546e-05} - -[Rank 2] Trainer log: {'loss': 0.456, 'grad_norm': 4.200094223022461, 'learning_rate': 1.3312498186144546e-05} -[Rank 0] Trainer log: {'loss': 0.456, 'grad_norm': 4.200094223022461, 'learning_rate': 1.3312498186144546e-05} -{'loss': 0.456, 'grad_norm': 4.200094223022461, 'learning_rate': 1.3312498186144546e-05, 'epoch': 0.42} -tensor(0.0501, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004212017171084881, 'train/lm_loss': 0.0001868090475909412, 'train/info_loss': 4.654905933421105e-05, 'train/ref_loss': 0.23988881707191467, 'train/uncertainty_loss': 0.005006403475999833, 'train/video_loss': 0.248311385512352, 'train/total_loss': 0.24849820137023926} -tensor(0.2319, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.41065182685852053, 'train/info_loss': 0.192414328455925, 'train/ref_loss': None, 'train/uncertainty_loss': -9.373787906952202e-05, 'train/video_loss': 0.1923205852508545, 'train/total_loss': 0.6029723882675171} -tensor(0.3839, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3727, 'grad_norm': 15.964927673339844, 'learning_rate': 1.330243101286951e-05}[Rank 3] Trainer log: {'loss': 0.3727, 'grad_norm': 15.964927673339844, 'learning_rate': 1.330243101286951e-05}[Rank 0] Trainer log: {'loss': 0.3727, 'grad_norm': 15.964927673339844, 'learning_rate': 1.330243101286951e-05} - -[Rank 1] Trainer log: {'loss': 0.3727, 'grad_norm': 15.964927673339844, 'learning_rate': 1.330243101286951e-05} - -{'loss': 0.3727, 'grad_norm': 15.964927673339844, 'learning_rate': 1.330243101286951e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1060, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000206578616052866, 'train/lm_loss': 5.199233419261873e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.2826026976108551, 'train/uncertainty_loss': 0.010603796690702438, 'train/video_loss': 0.29488760232925415, 'train/total_loss': 0.2949396073818207} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.3116, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2369, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2318, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024349999148398638, 'train/lm_loss': 0.00011917377123609185, 'train/info_loss': 4.142351099289954e-05, 'train/ref_loss': 0.37077879905700684, 'train/uncertainty_loss': 0.0231785848736763, 'train/video_loss': 0.39594680070877075, 'train/total_loss': 0.39606598019599915} -[Rank 0] Trainer log: {'loss': 0.4481, 'grad_norm': 2.0355186462402344, 'learning_rate': 1.3292360081541824e-05}[Rank 1] Trainer log: {'loss': 0.4481, 'grad_norm': 2.0355186462402344, 'learning_rate': 1.3292360081541824e-05} -[Rank 2] Trainer log: {'loss': 0.4481, 'grad_norm': 2.0355186462402344, 'learning_rate': 1.3292360081541824e-05} - -[Rank 3] Trainer log: {'loss': 0.4481, 'grad_norm': 2.0355186462402344, 'learning_rate': 1.3292360081541824e-05} -{'loss': 0.4481, 'grad_norm': 2.0355186462402344, 'learning_rate': 1.3292360081541824e-05, 'epoch': 0.42} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27070596218109133, 'train/info_loss': 0.19401347637176514, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014138943515717983, 'train/video_loss': 0.1938720941543579, 'train/total_loss': 0.4645780622959137} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1233, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35273702144622804, 'train/info_loss': 0.17314469814300537, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012031311634927989, 'train/video_loss': 0.17302438616752625, 'train/total_loss': 0.5257614254951477} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3836, 'grad_norm': 3.3573195934295654, 'learning_rate': 1.3282285403621864e-05}[Rank 1] Trainer log: {'loss': 0.3836, 'grad_norm': 3.3573195934295654, 'learning_rate': 1.3282285403621864e-05}[Rank 3] Trainer log: {'loss': 0.3836, 'grad_norm': 3.3573195934295654, 'learning_rate': 1.3282285403621864e-05} - -[Rank 0] Trainer log: {'loss': 0.3836, 'grad_norm': 3.3573195934295654, 'learning_rate': 1.3282285403621864e-05} - -{'loss': 0.3836, 'grad_norm': 3.3573195934295654, 'learning_rate': 1.3282285403621864e-05, 'epoch': 0.42} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00044773500412702565, 'train/lm_loss': 7.627939921803773e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.15398404002189636, 'train/uncertainty_loss': -6.915199337527156e-05, 'train/video_loss': 0.1575338989496231, 'train/total_loss': 0.15761017799377441} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1290, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022224306594580414, 'train/lm_loss': 5.9571932069957256e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.29647529125213623, 'train/uncertainty_loss': 0.012901423871517182, 'train/video_loss': 0.3111879229545593, 'train/total_loss': 0.31124749779701233} -[Rank 0] Trainer log: {'loss': 0.352, 'grad_norm': 3.5867183208465576, 'learning_rate': 1.3272206990574264e-05}[Rank 3] Trainer log: {'loss': 0.352, 'grad_norm': 3.5867183208465576, 'learning_rate': 1.3272206990574264e-05} - -[Rank 1] Trainer log: {'loss': 0.352, 'grad_norm': 3.5867183208465576, 'learning_rate': 1.3272206990574264e-05} -[Rank 2] Trainer log: {'loss': 0.352, 'grad_norm': 3.5867183208465576, 'learning_rate': 1.3272206990574264e-05} -{'loss': 0.352, 'grad_norm': 3.5867183208465576, 'learning_rate': 1.3272206990574264e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23618173599243164, 'train/info_loss': 0.20837607979774475, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000115494045894593, 'train/video_loss': 0.20826058089733124, 'train/total_loss': 0.4444423317909241} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1278, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016994276084005835, 'train/lm_loss': 5.916673690080643e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.29163360595703125, 'train/uncertainty_loss': 0.012782922387123108, 'train/video_loss': 0.30580833554267883, 'train/total_loss': 0.3058674931526184} -[Rank 0] Trainer log: {'loss': 0.3263, 'grad_norm': 4.384400844573975, 'learning_rate': 1.3262124853867912e-05}[Rank 3] Trainer log: {'loss': 0.3263, 'grad_norm': 4.384400844573975, 'learning_rate': 1.3262124853867912e-05} -[Rank 2] Trainer log: {'loss': 0.3263, 'grad_norm': 4.384400844573975, 'learning_rate': 1.3262124853867912e-05} - -[Rank 1] Trainer log: {'loss': 0.3263, 'grad_norm': 4.384400844573975, 'learning_rate': 1.3262124853867912e-05} -{'loss': 0.3263, 'grad_norm': 4.384400844573975, 'learning_rate': 1.3262124853867912e-05, 'epoch': 0.42} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1684741258621216, 'train/info_loss': 0.13033270835876465, 'train/ref_loss': None, 'train/uncertainty_loss': -8.69585550390184e-05, 'train/video_loss': 0.13024574518203735, 'train/total_loss': 0.2987198829650879} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40313024520874025, 'train/info_loss': 0.10646945238113403, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013452656567096712, 'train/video_loss': 0.10633492469787598, 'train/total_loss': 0.509465217590332} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0364, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3236, 'grad_norm': 2.164781093597412, 'learning_rate': 1.3252039004975932e-05}[Rank 2] Trainer log: {'loss': 0.3236, 'grad_norm': 2.164781093597412, 'learning_rate': 1.3252039004975932e-05}[Rank 1] Trainer log: {'loss': 0.3236, 'grad_norm': 2.164781093597412, 'learning_rate': 1.3252039004975932e-05} - - -[Rank 3] Trainer log: {'loss': 0.3236, 'grad_norm': 2.164781093597412, 'learning_rate': 1.3252039004975932e-05} -{'loss': 0.3236, 'grad_norm': 2.164781093597412, 'learning_rate': 1.3252039004975932e-05, 'epoch': 0.42} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17364249229431153, 'train/info_loss': 0.21087881922721863, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010386521462351084, 'train/video_loss': 0.2107749581336975, 'train/total_loss': 0.38441747426986694} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21526260375976564, 'train/info_loss': 0.2264682799577713, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012603510404005647, 'train/video_loss': 0.22634224593639374, 'train/total_loss': 0.4416048526763916} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0933, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4156, 'grad_norm': 5.892687797546387, 'learning_rate': 1.3241949455375668e-05}[Rank 1] Trainer log: {'loss': 0.4156, 'grad_norm': 5.892687797546387, 'learning_rate': 1.3241949455375668e-05} -[Rank 3] Trainer log: {'loss': 0.4156, 'grad_norm': 5.892687797546387, 'learning_rate': 1.3241949455375668e-05} - -[Rank 2] Trainer log: {'loss': 0.4156, 'grad_norm': 5.892687797546387, 'learning_rate': 1.3241949455375668e-05} -{'loss': 0.4156, 'grad_norm': 5.892687797546387, 'learning_rate': 1.3241949455375668e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3650156736373902, 'train/info_loss': 0.19136103987693787, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012336273211985827, 'train/video_loss': 0.191237673163414, 'train/total_loss': 0.5562533736228943} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.4703, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3240895986557007, 'train/info_loss': 0.12845997512340546, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011685900390148164, 'train/video_loss': 0.1283431202173233, 'train/total_loss': 0.4524327516555786} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4442, 'grad_norm': 8.292181015014648, 'learning_rate': 1.3231856216548687e-05}[Rank 2] Trainer log: {'loss': 0.4442, 'grad_norm': 8.292181015014648, 'learning_rate': 1.3231856216548687e-05} -[Rank 1] Trainer log: {'loss': 0.4442, 'grad_norm': 8.292181015014648, 'learning_rate': 1.3231856216548687e-05} - -[Rank 3] Trainer log: {'loss': 0.4442, 'grad_norm': 8.292181015014648, 'learning_rate': 1.3231856216548687e-05} -{'loss': 0.4442, 'grad_norm': 8.292181015014648, 'learning_rate': 1.3231856216548687e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3219, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0757, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001903733122162521, 'train/lm_loss': 4.00264747440815e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.2645185589790344, 'train/uncertainty_loss': 0.007567177712917328, 'train/video_loss': 0.27363404631614685, 'train/total_loss': 0.2736740708351135} -tensor(0.0135, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2962812423706055, 'train/info_loss': 0.2042345106601715, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001224999432452023, 'train/video_loss': 0.2041120082139969, 'train/total_loss': 0.500393271446228} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3741, 'grad_norm': 6.666689872741699, 'learning_rate': 1.322175929998074e-05}[Rank 2] Trainer log: {'loss': 0.3741, 'grad_norm': 6.666689872741699, 'learning_rate': 1.322175929998074e-05}[Rank 1] Trainer log: {'loss': 0.3741, 'grad_norm': 6.666689872741699, 'learning_rate': 1.322175929998074e-05} - - -[Rank 3] Trainer log: {'loss': 0.3741, 'grad_norm': 6.666689872741699, 'learning_rate': 1.322175929998074e-05} -{'loss': 0.3741, 'grad_norm': 6.666689872741699, 'learning_rate': 1.322175929998074e-05, 'epoch': 0.43} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32325658798217777, 'train/info_loss': 0.18789184093475342, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012945950729772448, 'train/video_loss': 0.18776237964630127, 'train/total_loss': 0.5110189914703369} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32105584144592286, 'train/info_loss': 0.29287874698638916, 'train/ref_loss': None, 'train/uncertainty_loss': -7.992156897671521e-05, 'train/video_loss': 0.29279881715774536, 'train/total_loss': 0.6138546466827393} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4771, 'grad_norm': 3.5954184532165527, 'learning_rate': 1.321165871716178e-05}[Rank 1] Trainer log: {'loss': 0.4771, 'grad_norm': 3.5954184532165527, 'learning_rate': 1.321165871716178e-05}[Rank 3] Trainer log: {'loss': 0.4771, 'grad_norm': 3.5954184532165527, 'learning_rate': 1.321165871716178e-05} - - -[Rank 2] Trainer log: {'loss': 0.4771, 'grad_norm': 3.5954184532165527, 'learning_rate': 1.321165871716178e-05} -{'loss': 0.4771, 'grad_norm': 3.5954184532165527, 'learning_rate': 1.321165871716178e-05, 'epoch': 0.43} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.8002, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4412734031677246, 'train/info_loss': 0.25678831338882446, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011302748462185264, 'train/video_loss': 0.25667527318000793, 'train/total_loss': 0.697948694229126} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00043866536580026153, 'train/lm_loss': 8.78141261637211e-05, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.17746558785438538, 'train/uncertainty_loss': -7.499580970034004e-05, 'train/video_loss': 0.18093562126159668, 'train/total_loss': 0.18102343380451202} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5619, 'grad_norm': 7.035113334655762, 'learning_rate': 1.320155447958591e-05}[Rank 3] Trainer log: {'loss': 0.5619, 'grad_norm': 7.035113334655762, 'learning_rate': 1.320155447958591e-05}[Rank 0] Trainer log: {'loss': 0.5619, 'grad_norm': 7.035113334655762, 'learning_rate': 1.320155447958591e-05} - -[Rank 2] Trainer log: {'loss': 0.5619, 'grad_norm': 7.035113334655762, 'learning_rate': 1.320155447958591e-05} - -{'loss': 0.5619, 'grad_norm': 7.035113334655762, 'learning_rate': 1.320155447958591e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1665, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020860042423009872, 'train/lm_loss': 4.6247857972048224e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.06749747693538666, 'train/uncertainty_loss': -7.462742505595089e-05, 'train/video_loss': 0.0691201463341713, 'train/total_loss': 0.0691663920879364} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003341373754665256, 'train/lm_loss': 8.669404196552933e-05, 'train/info_loss': 3.5403903893893585e-05, 'train/ref_loss': 0.1538560688495636, 'train/uncertainty_loss': -7.480793865397573e-05, 'train/video_loss': 0.1564897745847702, 'train/total_loss': 0.156576469540596} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3013, 'grad_norm': 4.0658440589904785, 'learning_rate': 1.319144659875141e-05}[Rank 2] Trainer log: {'loss': 0.3013, 'grad_norm': 4.0658440589904785, 'learning_rate': 1.319144659875141e-05} -[Rank 1] Trainer log: {'loss': 0.3013, 'grad_norm': 4.0658440589904785, 'learning_rate': 1.319144659875141e-05} - -[Rank 0] Trainer log: {'loss': 0.3013, 'grad_norm': 4.0658440589904785, 'learning_rate': 1.319144659875141e-05} -{'loss': 0.3013, 'grad_norm': 4.0658440589904785, 'learning_rate': 1.319144659875141e-05, 'epoch': 0.43} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10201206207275391, 'train/info_loss': 0.18444283306598663, 'train/ref_loss': None, 'train/uncertainty_loss': -8.446726715192199e-05, 'train/video_loss': 0.1843583583831787, 'train/total_loss': 0.2863704264163971} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3301219463348389, 'train/info_loss': 0.1974741518497467, 'train/ref_loss': None, 'train/uncertainty_loss': -9.490684606134893e-05, 'train/video_loss': 0.1973792463541031, 'train/total_loss': 0.5275012254714966} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3261, 'grad_norm': 4.608055591583252, 'learning_rate': 1.3181335086160697e-05} -[Rank 2] Trainer log: {'loss': 0.3261, 'grad_norm': 4.608055591583252, 'learning_rate': 1.3181335086160697e-05} -[Rank 0] Trainer log: {'loss': 0.3261, 'grad_norm': 4.608055591583252, 'learning_rate': 1.3181335086160697e-05}[Rank 3] Trainer log: {'loss': 0.3261, 'grad_norm': 4.608055591583252, 'learning_rate': 1.3181335086160697e-05} - -{'loss': 0.3261, 'grad_norm': 4.608055591583252, 'learning_rate': 1.3181335086160697e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004207128658890724, 'train/lm_loss': 6.758024683222175e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.1445823758840561, 'train/uncertainty_loss': -7.81897339038551e-05, 'train/video_loss': 0.14790453016757965, 'train/total_loss': 0.14797210693359375} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.4246, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=)tensor(0.0299, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -{'train/tv_loss': 0.00024700553622096776, 'train/lm_loss': 5.9428921667858964e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.5123814940452576, 'train/uncertainty_loss': 0.04246286749839783, 'train/video_loss': 0.5568526983261108, 'train/total_loss': 0.5569121241569519} -[Rank 0] Trainer log: {'loss': 0.355, 'grad_norm': 7.144419193267822, 'learning_rate': 1.3171219953320331e-05}[Rank 1] Trainer log: {'loss': 0.355, 'grad_norm': 7.144419193267822, 'learning_rate': 1.3171219953320331e-05} -[Rank 3] Trainer log: {'loss': 0.355, 'grad_norm': 7.144419193267822, 'learning_rate': 1.3171219953320331e-05} - -[Rank 2] Trainer log: {'loss': 0.355, 'grad_norm': 7.144419193267822, 'learning_rate': 1.3171219953320331e-05} -{'loss': 0.355, 'grad_norm': 7.144419193267822, 'learning_rate': 1.3171219953320331e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.9129, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15684806108474733, 'train/info_loss': 0.1523335874080658, 'train/ref_loss': None, 'train/uncertainty_loss': -8.878050721250474e-05, 'train/video_loss': 0.15224480628967285, 'train/total_loss': 0.3090928792953491} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.44017810821533204, 'train/info_loss': 0.26781538128852844, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012674554018303754, 'train/video_loss': 0.2676886320114136, 'train/total_loss': 0.7078667879104614} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0189, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.9266, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.5631, 'grad_norm': 17.468673706054688, 'learning_rate': 1.3161101211740974e-05} -[Rank 1] Trainer log: {'loss': 0.5631, 'grad_norm': 17.468673706054688, 'learning_rate': 1.3161101211740974e-05} -[Rank 2] Trainer log: {'loss': 0.5631, 'grad_norm': 17.468673706054688, 'learning_rate': 1.3161101211740974e-05} -[Rank 0] Trainer log: {'loss': 0.5631, 'grad_norm': 17.468673706054688, 'learning_rate': 1.3161101211740974e-05} -{'loss': 0.5631, 'grad_norm': 17.468673706054688, 'learning_rate': 1.3161101211740974e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0518, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031014792621135716, 'train/lm_loss': 7.670838385820389e-05, 'train/info_loss': 3.653631210909225e-05, 'train/ref_loss': 0.08249177038669586, 'train/uncertainty_loss': -6.974793504923583e-05, 'train/video_loss': 0.08493974804878235, 'train/total_loss': 0.08501645922660828} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19956308603286743, 'train/info_loss': 0.1442231684923172, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010600885143503548, 'train/video_loss': 0.14411716163158417, 'train/total_loss': 0.3436802625656128} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1963, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3123, 'grad_norm': 2.750178575515747, 'learning_rate': 1.3150978872937415e-05} -[Rank 2] Trainer log: {'loss': 0.3123, 'grad_norm': 2.750178575515747, 'learning_rate': 1.3150978872937415e-05} -[Rank 0] Trainer log: {'loss': 0.3123, 'grad_norm': 2.750178575515747, 'learning_rate': 1.3150978872937415e-05}[Rank 3] Trainer log: {'loss': 0.3123, 'grad_norm': 2.750178575515747, 'learning_rate': 1.3150978872937415e-05} - -{'loss': 0.3123, 'grad_norm': 2.750178575515747, 'learning_rate': 1.3150978872937415e-05, 'epoch': 0.43} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05483399033546448, 'train/info_loss': 0.20939496159553528, 'train/ref_loss': None, 'train/uncertainty_loss': -9.035825496539474e-05, 'train/video_loss': 0.2093046009540558, 'train/total_loss': 0.2641385793685913} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15749109983444215, 'train/info_loss': 0.152647465467453, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010160097153857351, 'train/video_loss': 0.15254586935043335, 'train/total_loss': 0.31003695726394653} -tensor(0.1765, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3739, 'grad_norm': 5.1435546875, 'learning_rate': 1.3140852948428519e-05}[Rank 2] Trainer log: {'loss': 0.3739, 'grad_norm': 5.1435546875, 'learning_rate': 1.3140852948428519e-05} - -[Rank 3] Trainer log: {'loss': 0.3739, 'grad_norm': 5.1435546875, 'learning_rate': 1.3140852948428519e-05} -[Rank 0] Trainer log: {'loss': 0.3739, 'grad_norm': 5.1435546875, 'learning_rate': 1.3140852948428519e-05} -{'loss': 0.3739, 'grad_norm': 5.1435546875, 'learning_rate': 1.3140852948428519e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1219570517539978, 'train/info_loss': 0.19632332026958466, 'train/ref_loss': None, 'train/uncertainty_loss': -8.995053358376027e-05, 'train/video_loss': 0.1962333768606186, 'train/total_loss': 0.31819042563438416} -tensor(0.0750, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0521, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.3322, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005415943916887045, 'train/lm_loss': 6.667455891147256e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.10823534429073334, 'train/uncertainty_loss': 6.993083516135812e-05, 'train/video_loss': 0.11267057806253433, 'train/total_loss': 0.1127372533082962} -tensor(0.1638, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3572, 'grad_norm': 3.2989466190338135, 'learning_rate': 1.3130723449737247e-05} -[Rank 3] Trainer log: {'loss': 0.3572, 'grad_norm': 3.2989466190338135, 'learning_rate': 1.3130723449737247e-05}[Rank 0] Trainer log: {'loss': 0.3572, 'grad_norm': 3.2989466190338135, 'learning_rate': 1.3130723449737247e-05}[Rank 2] Trainer log: {'loss': 0.3572, 'grad_norm': 3.2989466190338135, 'learning_rate': 1.3130723449737247e-05} - - -{'loss': 0.3572, 'grad_norm': 3.2989466190338135, 'learning_rate': 1.3130723449737247e-05, 'epoch': 0.43} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1568, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32011418342590336, 'train/info_loss': 0.30054545402526855, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001085464027710259, 'train/video_loss': 0.30043691396713257, 'train/total_loss': 0.6205511093139648} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.3917, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020805387757718565, 'train/lm_loss': 8.674170821905137e-05, 'train/info_loss': 3.713231490110047e-05, 'train/ref_loss': 0.4969310164451599, 'train/uncertainty_loss': 0.0391732931137085, 'train/video_loss': 0.5378058552742004, 'train/total_loss': 0.5378925800323486} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.466, 'grad_norm': 11.803627967834473, 'learning_rate': 1.312059038839061e-05}[Rank 2] Trainer log: {'loss': 0.466, 'grad_norm': 11.803627967834473, 'learning_rate': 1.312059038839061e-05} -[Rank 0] Trainer log: {'loss': 0.466, 'grad_norm': 11.803627967834473, 'learning_rate': 1.312059038839061e-05} -[Rank 1] Trainer log: {'loss': 0.466, 'grad_norm': 11.803627967834473, 'learning_rate': 1.312059038839061e-05} - -{'loss': 0.466, 'grad_norm': 11.803627967834473, 'learning_rate': 1.312059038839061e-05, 'epoch': 0.43} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2803356647491455, 'train/info_loss': 0.20754151046276093, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001111725578084588, 'train/video_loss': 0.20743033289909363, 'train/total_loss': 0.48776599764823914} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1550, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0421, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3151, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.6291, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002863940550014377, 'train/lm_loss': 7.604106795042754e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.658566415309906, 'train/uncertainty_loss': 0.06290672421455383, 'train/video_loss': 0.7237980961799622, 'train/total_loss': 0.7238741517066956} -[Rank 2] Trainer log: {'loss': 0.4167, 'grad_norm': 10.643487930297852, 'learning_rate': 1.3110453775919689e-05} -[Rank 1] Trainer log: {'loss': 0.4167, 'grad_norm': 10.643487930297852, 'learning_rate': 1.3110453775919689e-05} -[Rank 0] Trainer log: {'loss': 0.4167, 'grad_norm': 10.643487930297852, 'learning_rate': 1.3110453775919689e-05}[Rank 3] Trainer log: {'loss': 0.4167, 'grad_norm': 10.643487930297852, 'learning_rate': 1.3110453775919689e-05} - -{'loss': 0.4167, 'grad_norm': 10.643487930297852, 'learning_rate': 1.3110453775919689e-05, 'epoch': 0.43} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28883299827575687, 'train/info_loss': 0.27309951186180115, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013180336682125927, 'train/video_loss': 0.27296769618988037, 'train/total_loss': 0.5618007183074951} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0948, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0201, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00038029905408620834, 'train/lm_loss': 7.542141829617322e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.21141454577445984, 'train/uncertainty_loss': 0.002008544094860554, 'train/video_loss': 0.21649928390979767, 'train/total_loss': 0.21657469868659973} -[Rank 1] Trainer log: {'loss': 0.327, 'grad_norm': 3.776587724685669, 'learning_rate': 1.3100313623859596e-05} -[Rank 0] Trainer log: {'loss': 0.327, 'grad_norm': 3.776587724685669, 'learning_rate': 1.3100313623859596e-05}[Rank 2] Trainer log: {'loss': 0.327, 'grad_norm': 3.776587724685669, 'learning_rate': 1.3100313623859596e-05} - -[Rank 3] Trainer log: {'loss': 0.327, 'grad_norm': 3.776587724685669, 'learning_rate': 1.3100313623859596e-05} -{'loss': 0.327, 'grad_norm': 3.776587724685669, 'learning_rate': 1.3100313623859596e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0687, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007951629348099232, 'train/lm_loss': 7.706587784923614e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.18209904432296753, 'train/uncertainty_loss': -7.45856494177133e-05, 'train/video_loss': 0.1884204000234604, 'train/total_loss': 0.18849746882915497} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1555, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2581, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003115284023806453, 'train/lm_loss': 6.710357265546918e-05, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.390938401222229, 'train/uncertainty_loss': 0.025806090235710146, 'train/video_loss': 0.4192715883255005, 'train/total_loss': 0.4193387031555176} -[Rank 3] Trainer log: {'loss': 0.3449, 'grad_norm': 4.7037739753723145, 'learning_rate': 1.3090169943749475e-05}[Rank 2] Trainer log: {'loss': 0.3449, 'grad_norm': 4.7037739753723145, 'learning_rate': 1.3090169943749475e-05} -[Rank 1] Trainer log: {'loss': 0.3449, 'grad_norm': 4.7037739753723145, 'learning_rate': 1.3090169943749475e-05} - -[Rank 0] Trainer log: {'loss': 0.3449, 'grad_norm': 4.7037739753723145, 'learning_rate': 1.3090169943749475e-05} -{'loss': 0.3449, 'grad_norm': 4.7037739753723145, 'learning_rate': 1.3090169943749475e-05, 'epoch': 0.43} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1919, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003524106461554766, 'train/lm_loss': 0.00010540091898292303, 'train/info_loss': 3.9814316551201046e-05, 'train/ref_loss': 0.17931832373142242, 'train/uncertainty_loss': -6.831570062786341e-05, 'train/video_loss': 0.18210910260677338, 'train/total_loss': 0.18221449851989746} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20731093883514407, 'train/info_loss': 0.13337959349155426, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011045770952478051, 'train/video_loss': 0.13326913118362427, 'train/total_loss': 0.3405800759792328} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2637, 'grad_norm': 5.191952228546143, 'learning_rate': 1.3080022747132488e-05}[Rank 2] Trainer log: {'loss': 0.2637, 'grad_norm': 5.191952228546143, 'learning_rate': 1.3080022747132488e-05}[Rank 3] Trainer log: {'loss': 0.2637, 'grad_norm': 5.191952228546143, 'learning_rate': 1.3080022747132488e-05} - - -[Rank 0] Trainer log: {'loss': 0.2637, 'grad_norm': 5.191952228546143, 'learning_rate': 1.3080022747132488e-05} -{'loss': 0.2637, 'grad_norm': 5.191952228546143, 'learning_rate': 1.3080022747132488e-05, 'epoch': 0.43} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1388, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023662224411964418, 'train/lm_loss': 6.74610782880336e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.30735933780670166, 'train/uncertainty_loss': 0.013881208002567293, 'train/video_loss': 0.32316815853118896, 'train/total_loss': 0.3232356309890747} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.014622080326080323, 'train/info_loss': 0.2021673023700714, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010953443124890329, 'train/video_loss': 0.20205776393413544, 'train/total_loss': 0.21667984127998352} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3967, 'grad_norm': 7.673417568206787, 'learning_rate': 1.3069872045555792e-05}[Rank 2] Trainer log: {'loss': 0.3967, 'grad_norm': 7.673417568206787, 'learning_rate': 1.3069872045555792e-05} -[Rank 3] Trainer log: {'loss': 0.3967, 'grad_norm': 7.673417568206787, 'learning_rate': 1.3069872045555792e-05} - -[Rank 0] Trainer log: {'loss': 0.3967, 'grad_norm': 7.673417568206787, 'learning_rate': 1.3069872045555792e-05} -{'loss': 0.3967, 'grad_norm': 7.673417568206787, 'learning_rate': 1.3069872045555792e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.2503, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4797, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003081987844780088, 'train/lm_loss': 5.182548193261027e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.5643867254257202, 'train/uncertainty_loss': 0.04797437787055969, 'train/video_loss': 0.6148580312728882, 'train/total_loss': 0.614909827709198} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03634399473667145, 'train/info_loss': 0.1398530900478363, 'train/ref_loss': None, 'train/uncertainty_loss': -9.613056899979711e-05, 'train/video_loss': 0.1397569626569748, 'train/total_loss': 0.176100954413414} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4077, 'grad_norm': 13.45173454284668, 'learning_rate': 1.3059717850570534e-05}[Rank 0] Trainer log: {'loss': 0.4077, 'grad_norm': 13.45173454284668, 'learning_rate': 1.3059717850570534e-05} -[Rank 1] Trainer log: {'loss': 0.4077, 'grad_norm': 13.45173454284668, 'learning_rate': 1.3059717850570534e-05} - -[Rank 2] Trainer log: {'loss': 0.4077, 'grad_norm': 13.45173454284668, 'learning_rate': 1.3059717850570534e-05} -{'loss': 0.4077, 'grad_norm': 13.45173454284668, 'learning_rate': 1.3059717850570534e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0590, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4132, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017696106806397438, 'train/lm_loss': 7.487325929105282e-05, 'train/info_loss': 3.278148142271675e-05, 'train/ref_loss': 0.4666178822517395, 'train/uncertainty_loss': 0.04132092893123627, 'train/video_loss': 0.5093873143196106, 'train/total_loss': 0.5094621777534485} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.9543, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027805562131106855, 'train/lm_loss': 0.0001097854576073587, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.11827182024717331, 'train/uncertainty_loss': -7.177931256592273e-05, 'train/video_loss': 0.12046370655298233, 'train/total_loss': 0.120573490858078} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4343, 'grad_norm': 13.181310653686523, 'learning_rate': 1.3049560173731841e-05} -[Rank 0] Trainer log: {'loss': 0.4343, 'grad_norm': 13.181310653686523, 'learning_rate': 1.3049560173731841e-05}[Rank 3] Trainer log: {'loss': 0.4343, 'grad_norm': 13.181310653686523, 'learning_rate': 1.3049560173731841e-05} - -[Rank 2] Trainer log: {'loss': 0.4343, 'grad_norm': 13.181310653686523, 'learning_rate': 1.3049560173731841e-05} -{'loss': 0.4343, 'grad_norm': 13.181310653686523, 'learning_rate': 1.3049560173731841e-05, 'epoch': 0.43} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0228, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2668, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017873833421617747, 'train/lm_loss': 5.13249367941171e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.39448514580726624, 'train/uncertainty_loss': 0.026677802205085754, 'train/video_loss': 0.4226241707801819, 'train/total_loss': 0.4226754903793335} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1672, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030378079973161223, 'train/lm_loss': 7.539758225902916e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.04680386558175087, 'train/uncertainty_loss': -7.136692292988301e-05, 'train/video_loss': 0.04919654130935669, 'train/total_loss': 0.04927193745970726} -[Rank 1] Trainer log: {'loss': 0.3175, 'grad_norm': 3.7236411571502686, 'learning_rate': 1.3039399026598798e-05}[Rank 3] Trainer log: {'loss': 0.3175, 'grad_norm': 3.7236411571502686, 'learning_rate': 1.3039399026598798e-05} -[Rank 0] Trainer log: {'loss': 0.3175, 'grad_norm': 3.7236411571502686, 'learning_rate': 1.3039399026598798e-05} - -[Rank 2] Trainer log: {'loss': 0.3175, 'grad_norm': 3.7236411571502686, 'learning_rate': 1.3039399026598798e-05} -{'loss': 0.3175, 'grad_norm': 3.7236411571502686, 'learning_rate': 1.3039399026598798e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2572831392288208, 'train/info_loss': 0.17838220298290253, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010631229961290956, 'train/video_loss': 0.17827589809894562, 'train/total_loss': 0.4355590343475342} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(0.4098, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.5427, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003138439962640405, 'train/lm_loss': 6.662689265795052e-05, 'train/info_loss': 3.43310966854915e-05, 'train/ref_loss': 0.1963808536529541, 'train/uncertainty_loss': -6.754964706487954e-05, 'train/video_loss': 0.19885839521884918, 'train/total_loss': 0.19892501831054688} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4234, 'grad_norm': 9.004124641418457, 'learning_rate': 1.3029234420734438e-05} -[Rank 3] Trainer log: {'loss': 0.4234, 'grad_norm': 9.004124641418457, 'learning_rate': 1.3029234420734438e-05} -[Rank 2] Trainer log: {'loss': 0.4234, 'grad_norm': 9.004124641418457, 'learning_rate': 1.3029234420734438e-05} -[Rank 0] Trainer log: {'loss': 0.4234, 'grad_norm': 9.004124641418457, 'learning_rate': 1.3029234420734438e-05} -{'loss': 0.4234, 'grad_norm': 9.004124641418457, 'learning_rate': 1.3029234420734438e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08625665307044983, 'train/info_loss': 0.19003786146640778, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011080055264756083, 'train/video_loss': 0.18992705643177032, 'train/total_loss': 0.27618372440338135} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12322157621383667, 'train/info_loss': 0.2218244969844818, 'train/ref_loss': None, 'train/uncertainty_loss': -9.224672103300691e-05, 'train/video_loss': 0.2217322438955307, 'train/total_loss': 0.34495383501052856} -tensor(0.1021, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3279, 'grad_norm': 5.628873825073242, 'learning_rate': 1.3019066367705734e-05} -[Rank 3] Trainer log: {'loss': 0.3279, 'grad_norm': 5.628873825073242, 'learning_rate': 1.3019066367705734e-05}[Rank 0] Trainer log: {'loss': 0.3279, 'grad_norm': 5.628873825073242, 'learning_rate': 1.3019066367705734e-05} - -[Rank 2] Trainer log: {'loss': 0.3279, 'grad_norm': 5.628873825073242, 'learning_rate': 1.3019066367705734e-05} -{'loss': 0.3279, 'grad_norm': 5.628873825073242, 'learning_rate': 1.3019066367705734e-05, 'epoch': 0.43} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.2653, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031179366633296015, 'train/lm_loss': 5.1467947196215397e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.39068475365638733, 'train/uncertainty_loss': 0.026531401276588443, 'train/video_loss': 0.41974058747291565, 'train/total_loss': 0.4197920560836792} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1898, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002811664249747992, 'train/lm_loss': 6.631705327890814e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.3390262722969055, 'train/uncertainty_loss': 0.018976444005966188, 'train/video_loss': 0.36028531193733215, 'train/total_loss': 0.3603516221046448} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3311, 'grad_norm': 6.904906749725342, 'learning_rate': 1.300889487908358e-05} -[Rank 3] Trainer log: {'loss': 0.3311, 'grad_norm': 6.904906749725342, 'learning_rate': 1.300889487908358e-05} -[Rank 0] Trainer log: {'loss': 0.3311, 'grad_norm': 6.904906749725342, 'learning_rate': 1.300889487908358e-05} -[Rank 2] Trainer log: {'loss': 0.3311, 'grad_norm': 6.904906749725342, 'learning_rate': 1.300889487908358e-05} -{'loss': 0.3311, 'grad_norm': 6.904906749725342, 'learning_rate': 1.300889487908358e-05, 'epoch': 0.43} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36587462425231937, 'train/info_loss': 0.144365131855011, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015200923662632705, 'train/video_loss': 0.14421312510967255, 'train/total_loss': 0.5100877285003662} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002404517959803343, 'train/lm_loss': 6.591187557205559e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.16126605868339539, 'train/uncertainty_loss': -6.92508474458009e-05, 'train/video_loss': 0.16315172612667084, 'train/total_loss': 0.16321763396263123} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3354, 'grad_norm': 4.071390151977539, 'learning_rate': 1.2998719966442782e-05}[Rank 1] Trainer log: {'loss': 0.3354, 'grad_norm': 4.071390151977539, 'learning_rate': 1.2998719966442782e-05} - -[Rank 0] Trainer log: {'loss': 0.3354, 'grad_norm': 4.071390151977539, 'learning_rate': 1.2998719966442782e-05}[Rank 2] Trainer log: {'loss': 0.3354, 'grad_norm': 4.071390151977539, 'learning_rate': 1.2998719966442782e-05} - -{'loss': 0.3354, 'grad_norm': 4.071390151977539, 'learning_rate': 1.2998719966442782e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34208891391754154, 'train/info_loss': 0.15509794652462006, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011228768853470683, 'train/video_loss': 0.15498566627502441, 'train/total_loss': 0.49707457423210144} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1567, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1551176190376282, 'train/info_loss': 0.24027863144874573, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001294466434046626, 'train/video_loss': 0.24014918506145477, 'train/total_loss': 0.3952668011188507} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4907, 'grad_norm': 10.219313621520996, 'learning_rate': 1.2988541641362033e-05}[Rank 3] Trainer log: {'loss': 0.4907, 'grad_norm': 10.219313621520996, 'learning_rate': 1.2988541641362033e-05}[Rank 0] Trainer log: {'loss': 0.4907, 'grad_norm': 10.219313621520996, 'learning_rate': 1.2988541641362033e-05} - - -[Rank 2] Trainer log: {'loss': 0.4907, 'grad_norm': 10.219313621520996, 'learning_rate': 1.2988541641362033e-05} -{'loss': 0.4907, 'grad_norm': 10.219313621520996, 'learning_rate': 1.2988541641362033e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28071174621582035, 'train/info_loss': 0.2097240835428238, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011223892215639353, 'train/video_loss': 0.20961184799671173, 'train/total_loss': 0.49032360315322876} -tensor(0.2251, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2542, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.4094, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0032, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022646957077085972, 'train/lm_loss': 3.964508068747819e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.20744258165359497, 'train/uncertainty_loss': 0.00031913125421851874, 'train/video_loss': 0.2095995843410492, 'train/total_loss': 0.20963923633098602} -[Rank 0] Trainer log: {'loss': 0.4241, 'grad_norm': 12.80066967010498, 'learning_rate': 1.2978359915423924e-05}[Rank 2] Trainer log: {'loss': 0.4241, 'grad_norm': 12.80066967010498, 'learning_rate': 1.2978359915423924e-05} - -[Rank 1] Trainer log: {'loss': 0.4241, 'grad_norm': 12.80066967010498, 'learning_rate': 1.2978359915423924e-05} -[Rank 3] Trainer log: {'loss': 0.4241, 'grad_norm': 12.80066967010498, 'learning_rate': 1.2978359915423924e-05} -{'loss': 0.4241, 'grad_norm': 12.80066967010498, 'learning_rate': 1.2978359915423924e-05, 'epoch': 0.43} -tensor(0.3705, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3971, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.3614, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004619215615093708, 'train/lm_loss': 5.78319828491658e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.46773561835289, 'train/uncertainty_loss': 0.03613616228103638, 'train/video_loss': 0.5075997114181519, 'train/total_loss': 0.507657527923584} -tensor(0.3292, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028539898339658977, 'train/lm_loss': 3.049141087103635e-05, 'train/info_loss': 2.4377704903599806e-05, 'train/ref_loss': 0.17411160469055176, 'train/uncertainty_loss': -7.14857247658074e-05, 'train/video_loss': 0.17634768784046173, 'train/total_loss': 0.17637817561626434} -[Rank 0] Trainer log: {'loss': 0.4339, 'grad_norm': 21.98418617248535, 'learning_rate': 1.2968174800214901e-05}[Rank 3] Trainer log: {'loss': 0.4339, 'grad_norm': 21.98418617248535, 'learning_rate': 1.2968174800214901e-05}[Rank 1] Trainer log: {'loss': 0.4339, 'grad_norm': 21.98418617248535, 'learning_rate': 1.2968174800214901e-05} - -[Rank 2] Trainer log: {'loss': 0.4339, 'grad_norm': 21.98418617248535, 'learning_rate': 1.2968174800214901e-05} - -{'loss': 0.4339, 'grad_norm': 21.98418617248535, 'learning_rate': 1.2968174800214901e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005709333345293999, 'train/lm_loss': 3.8786942604929214e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.1379050761461258, 'train/uncertainty_loss': -8.481152472086252e-05, 'train/video_loss': 0.14241620898246765, 'train/total_loss': 0.14245499670505524} -tensor(0.3474, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3078930377960205, 'train/info_loss': 0.15585672855377197, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011356973554939033, 'train/video_loss': 0.15574315190315247, 'train/total_loss': 0.463636189699173} -tensor(0.3747, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4817, 'grad_norm': 11.849188804626465, 'learning_rate': 1.2957986307325282e-05}[Rank 0] Trainer log: {'loss': 0.4817, 'grad_norm': 11.849188804626465, 'learning_rate': 1.2957986307325282e-05}[Rank 1] Trainer log: {'loss': 0.4817, 'grad_norm': 11.849188804626465, 'learning_rate': 1.2957986307325282e-05} - - -[Rank 2] Trainer log: {'loss': 0.4817, 'grad_norm': 11.849188804626465, 'learning_rate': 1.2957986307325282e-05} -{'loss': 0.4817, 'grad_norm': 11.849188804626465, 'learning_rate': 1.2957986307325282e-05, 'epoch': 0.43} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3651862859725952, 'train/info_loss': 0.3378802537918091, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011311344569548966, 'train/video_loss': 0.3377671539783478, 'train/total_loss': 0.7029534578323364} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0646, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.6030, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.2852, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.194490122795105, 'train/info_loss': 0.17903709411621094, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011344451922923326, 'train/video_loss': 0.17892365157604218, 'train/total_loss': 0.3734137713909149} -[Rank 1] Trainer log: {'loss': 0.4953, 'grad_norm': 9.451658248901367, 'learning_rate': 1.2947794448349206e-05} -[Rank 3] Trainer log: {'loss': 0.4953, 'grad_norm': 9.451658248901367, 'learning_rate': 1.2947794448349206e-05} -[Rank 0] Trainer log: {'loss': 0.4953, 'grad_norm': 9.451658248901367, 'learning_rate': 1.2947794448349206e-05}[Rank 2] Trainer log: {'loss': 0.4953, 'grad_norm': 9.451658248901367, 'learning_rate': 1.2947794448349206e-05} - -{'loss': 0.4953, 'grad_norm': 9.451658248901367, 'learning_rate': 1.2947794448349206e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.1196, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019522602669894696, 'train/lm_loss': 5.792732699774206e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.17938420176506042, 'train/uncertainty_loss': -6.961195613257587e-05, 'train/video_loss': 0.18090815842151642, 'train/total_loss': 0.1809660792350769} -tensor(0.1811, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13963104486465455, 'train/info_loss': 0.25338509678840637, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011037503136321903, 'train/video_loss': 0.25327470898628235, 'train/total_loss': 0.3929057717323303} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3928, 'grad_norm': 6.845163345336914, 'learning_rate': 1.2937599234884674e-05}[Rank 2] Trainer log: {'loss': 0.3928, 'grad_norm': 6.845163345336914, 'learning_rate': 1.2937599234884674e-05} -[Rank 1] Trainer log: {'loss': 0.3928, 'grad_norm': 6.845163345336914, 'learning_rate': 1.2937599234884674e-05} - -[Rank 3] Trainer log: {'loss': 0.3928, 'grad_norm': 6.845163345336914, 'learning_rate': 1.2937599234884674e-05} -{'loss': 0.3928, 'grad_norm': 6.845163345336914, 'learning_rate': 1.2937599234884674e-05, 'epoch': 0.43} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031678138766437773, 'train/lm_loss': 5.752213182859123e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.16663265228271484, 'train/uncertainty_loss': -7.128026918508112e-05, 'train/video_loss': 0.16912637650966644, 'train/total_loss': 0.1691838949918747} -tensor(0.1114, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0657, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002104278886690736, 'train/lm_loss': 5.802266532555223e-05, 'train/info_loss': 3.43310966854915e-05, 'train/ref_loss': 0.14824247360229492, 'train/uncertainty_loss': -6.93572626914829e-05, 'train/video_loss': 0.14989088475704193, 'train/total_loss': 0.14994890987873077} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3473, 'grad_norm': 5.982193470001221, 'learning_rate': 1.2927400678533479e-05}[Rank 1] Trainer log: {'loss': 0.3473, 'grad_norm': 5.982193470001221, 'learning_rate': 1.2927400678533479e-05} -[Rank 2] Trainer log: {'loss': 0.3473, 'grad_norm': 5.982193470001221, 'learning_rate': 1.2927400678533479e-05} - -[Rank 3] Trainer log: {'loss': 0.3473, 'grad_norm': 5.982193470001221, 'learning_rate': 1.2927400678533479e-05} -{'loss': 0.3473, 'grad_norm': 5.982193470001221, 'learning_rate': 1.2927400678533479e-05, 'epoch': 0.44} -tensor(-0.0017, device='cuda:1', grad_fn=) tensor(-0.0017, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24522273540496828, 'train/info_loss': 0.17340752482414246, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010492443107068539, 'train/video_loss': 0.17330260574817657, 'train/total_loss': 0.4185253381729126} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0968, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003197409212589264, 'train/lm_loss': 7.473026053048671e-05, 'train/info_loss': 3.5403903893893585e-05, 'train/ref_loss': 0.23663735389709473, 'train/uncertainty_loss': 0.009677628427743912, 'train/video_loss': 0.24890832602977753, 'train/total_loss': 0.24898305535316467} -tensor(0.1017, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(0.0508, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3374, 'grad_norm': 3.93158221244812, 'learning_rate': 1.2917198790901231e-05} -[Rank 2] Trainer log: {'loss': 0.3374, 'grad_norm': 3.93158221244812, 'learning_rate': 1.2917198790901231e-05} -[Rank 0] Trainer log: {'loss': 0.3374, 'grad_norm': 3.93158221244812, 'learning_rate': 1.2917198790901231e-05}[Rank 3] Trainer log: {'loss': 0.3374, 'grad_norm': 3.93158221244812, 'learning_rate': 1.2917198790901231e-05} - -{'loss': 0.3374, 'grad_norm': 3.93158221244812, 'learning_rate': 1.2917198790901231e-05, 'epoch': 0.44} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1561, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12311769723892213, 'train/info_loss': 0.10659302771091461, 'train/ref_loss': None, 'train/uncertainty_loss': -9.836002718657256e-05, 'train/video_loss': 0.10649466514587402, 'train/total_loss': 0.22961236536502838} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021380861289799213, 'train/lm_loss': 6.631705327890814e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.14695154130458832, 'train/uncertainty_loss': -6.629496929235757e-05, 'train/video_loss': 0.1486317217350006, 'train/total_loss': 0.14869803190231323} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3017, 'grad_norm': 2.3499255180358887, 'learning_rate': 1.2906993583597318e-05}[Rank 3] Trainer log: {'loss': 0.3017, 'grad_norm': 2.3499255180358887, 'learning_rate': 1.2906993583597318e-05} - -[Rank 2] Trainer log: {'loss': 0.3017, 'grad_norm': 2.3499255180358887, 'learning_rate': 1.2906993583597318e-05} -[Rank 0] Trainer log: {'loss': 0.3017, 'grad_norm': 2.3499255180358887, 'learning_rate': 1.2906993583597318e-05} -{'loss': 0.3017, 'grad_norm': 2.3499255180358887, 'learning_rate': 1.2906993583597318e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.396513843536377, 'train/info_loss': 0.2170768827199936, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011988416081294418, 'train/video_loss': 0.2169570028781891, 'train/total_loss': 0.6134708523750305} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3042440414428711, 'train/info_loss': 0.1614048331975937, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011388531420379877, 'train/video_loss': 0.1612909436225891, 'train/total_loss': 0.4655349850654602} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4063, 'grad_norm': 1.9845776557922363, 'learning_rate': 1.2896785068234925e-05}[Rank 0] Trainer log: {'loss': 0.4063, 'grad_norm': 1.9845776557922363, 'learning_rate': 1.2896785068234925e-05} -[Rank 3] Trainer log: {'loss': 0.4063, 'grad_norm': 1.9845776557922363, 'learning_rate': 1.2896785068234925e-05} - -{'loss': 0.4063, 'grad_norm': 1.9845776557922363, 'learning_rate': 1.2896785068234925e-05, 'epoch': 0.44} -[Rank 2] Trainer log: {'loss': 0.4063, 'grad_norm': 1.9845776557922363, 'learning_rate': 1.2896785068234925e-05} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.41091122627258303, 'train/info_loss': 0.2725898325443268, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001472089788876474, 'train/video_loss': 0.27244260907173157, 'train/total_loss': 0.683353841304779} -tensor(0.1791, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27135810852050785, 'train/info_loss': 0.14957180619239807, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010338638676330448, 'train/video_loss': 0.14946842193603516, 'train/total_loss': 0.4208265244960785} -tensor(0.0727, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4809, 'grad_norm': 4.222432613372803, 'learning_rate': 1.288657325643098e-05} -[Rank 3] Trainer log: {'loss': 0.4809, 'grad_norm': 4.222432613372803, 'learning_rate': 1.288657325643098e-05}[Rank 0] Trainer log: {'loss': 0.4809, 'grad_norm': 4.222432613372803, 'learning_rate': 1.288657325643098e-05} - -[Rank 2] Trainer log: {'loss': 0.4809, 'grad_norm': 4.222432613372803, 'learning_rate': 1.288657325643098e-05} -{'loss': 0.4809, 'grad_norm': 4.222432613372803, 'learning_rate': 1.288657325643098e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17182228565216065, 'train/info_loss': 0.19068363308906555, 'train/ref_loss': None, 'train/uncertainty_loss': -8.967916364781559e-05, 'train/video_loss': 0.19059395790100098, 'train/total_loss': 0.36241626739501953} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1546, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020205008331686258, 'train/lm_loss': 3.935903368983418e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.3043319880962372, 'train/uncertainty_loss': 0.015456004440784455, 'train/video_loss': 0.32143422961235046, 'train/total_loss': 0.3214735984802246} -[Rank 1] Trainer log: {'loss': 0.3263, 'grad_norm': 1.8554898500442505, 'learning_rate': 1.2876358159806182e-05}[Rank 0] Trainer log: {'loss': 0.3263, 'grad_norm': 1.8554898500442505, 'learning_rate': 1.2876358159806182e-05} -[Rank 3] Trainer log: {'loss': 0.3263, 'grad_norm': 1.8554898500442505, 'learning_rate': 1.2876358159806182e-05} - -[Rank 2] Trainer log: {'loss': 0.3263, 'grad_norm': 1.8554898500442505, 'learning_rate': 1.2876358159806182e-05} -{'loss': 0.3263, 'grad_norm': 1.8554898500442505, 'learning_rate': 1.2876358159806182e-05, 'epoch': 0.44} -tensor(0.1501, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1420, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27157952785491946, 'train/info_loss': 0.16241523623466492, 'train/ref_loss': None, 'train/uncertainty_loss': -9.499616571702064e-05, 'train/video_loss': 0.16232024133205414, 'train/total_loss': 0.43389976024627686} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0234, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003277544165030122, 'train/lm_loss': 5.113424849696458e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.2134949266910553, 'train/uncertainty_loss': 0.002337332628667355, 'train/video_loss': 0.21848654747009277, 'train/total_loss': 0.21853768825531006} -tensor(0.2298, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3366, 'grad_norm': 4.882051467895508, 'learning_rate': 1.2866139789984953e-05}[Rank 0] Trainer log: {'loss': 0.3366, 'grad_norm': 4.882051467895508, 'learning_rate': 1.2866139789984953e-05} - -[Rank 1] Trainer log: {'loss': 0.3366, 'grad_norm': 4.882051467895508, 'learning_rate': 1.2866139789984953e-05} -[Rank 3] Trainer log: {'loss': 0.3366, 'grad_norm': 4.882051467895508, 'learning_rate': 1.2866139789984953e-05} -{'loss': 0.3366, 'grad_norm': 4.882051467895508, 'learning_rate': 1.2866139789984953e-05, 'epoch': 0.44} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08965242505073548, 'train/info_loss': 0.17881359159946442, 'train/ref_loss': None, 'train/uncertainty_loss': -8.585789473727346e-05, 'train/video_loss': 0.17872773110866547, 'train/total_loss': 0.26838016510009766} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002616179874166846, 'train/lm_loss': 5.76889724470675e-05, 'train/info_loss': 3.301988181192428e-05, 'train/ref_loss': 0.08608076721429825, 'train/uncertainty_loss': -7.298320415429772e-05, 'train/video_loss': 0.08813374489545822, 'train/total_loss': 0.0881914347410202} -tensor(0.1320, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2545, 'grad_norm': 2.985894203186035, 'learning_rate': 1.2855918158595446e-05}[Rank 3] Trainer log: {'loss': 0.2545, 'grad_norm': 2.985894203186035, 'learning_rate': 1.2855918158595446e-05}[Rank 1] Trainer log: {'loss': 0.2545, 'grad_norm': 2.985894203186035, 'learning_rate': 1.2855918158595446e-05} - - -[Rank 0] Trainer log: {'loss': 0.2545, 'grad_norm': 2.985894203186035, 'learning_rate': 1.2855918158595446e-05} -{'loss': 0.2545, 'grad_norm': 2.985894203186035, 'learning_rate': 1.2855918158595446e-05, 'epoch': 0.44} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0439, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020993337966501713, 'train/lm_loss': 6.619788473471999e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.16867947578430176, 'train/uncertainty_loss': -6.695822812616826e-05, 'train/video_loss': 0.17032524943351746, 'train/total_loss': 0.17039144039154053} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20780715942382813, 'train/info_loss': 0.1664077341556549, 'train/ref_loss': None, 'train/uncertainty_loss': -9.23088809940964e-05, 'train/video_loss': 0.16631542146205902, 'train/total_loss': 0.37412258982658386} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3533, 'grad_norm': 4.85083532333374, 'learning_rate': 1.2845693277269529e-05}[Rank 3] Trainer log: {'loss': 0.3533, 'grad_norm': 4.85083532333374, 'learning_rate': 1.2845693277269529e-05}[Rank 1] Trainer log: {'loss': 0.3533, 'grad_norm': 4.85083532333374, 'learning_rate': 1.2845693277269529e-05} - - -[Rank 0] Trainer log: {'loss': 0.3533, 'grad_norm': 4.85083532333374, 'learning_rate': 1.2845693277269529e-05} -{'loss': 0.3533, 'grad_norm': 4.85083532333374, 'learning_rate': 1.2845693277269529e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27036027908325194, 'train/info_loss': 0.27652105689048767, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012636169558390974, 'train/video_loss': 0.27639469504356384, 'train/total_loss': 0.5467549562454224} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.9298, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019132236484438182, 'train/lm_loss': 0.00013344603357836604, 'train/info_loss': 4.142351099289954e-05, 'train/ref_loss': 0.8357201814651489, 'train/uncertainty_loss': 0.09297518134117128, 'train/video_loss': 0.9302673935890198, 'train/total_loss': 0.9304008483886719} -[Rank 1] Trainer log: {'loss': 0.3175, 'grad_norm': 8.241012573242188, 'learning_rate': 1.283546515764276e-05}[Rank 2] Trainer log: {'loss': 0.3175, 'grad_norm': 8.241012573242188, 'learning_rate': 1.283546515764276e-05} - -[Rank 3] Trainer log: {'loss': 0.3175, 'grad_norm': 8.241012573242188, 'learning_rate': 1.283546515764276e-05} -[Rank 0] Trainer log: {'loss': 0.3175, 'grad_norm': 8.241012573242188, 'learning_rate': 1.283546515764276e-05} -{'loss': 0.3175, 'grad_norm': 8.241012573242188, 'learning_rate': 1.283546515764276e-05, 'epoch': 0.44} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11162246465682985, 'train/info_loss': 0.17798560857772827, 'train/ref_loss': None, 'train/uncertainty_loss': -8.619803702458739e-05, 'train/video_loss': 0.17789940536022186, 'train/total_loss': 0.28952187299728394} -tensor(1.0950, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3416, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1070, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001565850921906531, 'train/lm_loss': 3.452003002166748e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.286615252494812, 'train/uncertainty_loss': 0.010699699819087983, 'train/video_loss': 0.29859545826911926, 'train/total_loss': 0.2986299693584442} -[Rank 1] Trainer log: {'loss': 0.5012, 'grad_norm': 15.930635452270508, 'learning_rate': 1.2825233811354389e-05}[Rank 0] Trainer log: {'loss': 0.5012, 'grad_norm': 15.930635452270508, 'learning_rate': 1.2825233811354389e-05} -[Rank 2] Trainer log: {'loss': 0.5012, 'grad_norm': 15.930635452270508, 'learning_rate': 1.2825233811354389e-05}[Rank 3] Trainer log: {'loss': 0.5012, 'grad_norm': 15.930635452270508, 'learning_rate': 1.2825233811354389e-05} - - -{'loss': 0.5012, 'grad_norm': 15.930635452270508, 'learning_rate': 1.2825233811354389e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0282, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1241, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024132044054567814, 'train/lm_loss': 8.995893876999617e-05, 'train/info_loss': 3.832431684713811e-05, 'train/ref_loss': 0.2867826521396637, 'train/uncertainty_loss': 0.012411907315254211, 'train/video_loss': 0.3011634349822998, 'train/total_loss': 0.30125340819358826} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3585632085800171, 'train/info_loss': 0.22078058123588562, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013956199400126936, 'train/video_loss': 0.22064101696014404, 'train/total_loss': 0.5792042016983032} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(1.1462, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4566, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.5689, 'grad_norm': 11.378826141357422, 'learning_rate': 1.2814999250047335e-05}[Rank 1] Trainer log: {'loss': 0.5689, 'grad_norm': 11.378826141357422, 'learning_rate': 1.2814999250047335e-05} -[Rank 3] Trainer log: {'loss': 0.5689, 'grad_norm': 11.378826141357422, 'learning_rate': 1.2814999250047335e-05} - -[Rank 0] Trainer log: {'loss': 0.5689, 'grad_norm': 11.378826141357422, 'learning_rate': 1.2814999250047335e-05} -{'loss': 0.5689, 'grad_norm': 11.378826141357422, 'learning_rate': 1.2814999250047335e-05, 'epoch': 0.44} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26964850425720216, 'train/info_loss': 0.17680855095386505, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010814594570547343, 'train/video_loss': 0.1767003983259201, 'train/total_loss': 0.4463489055633545} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23597402572631837, 'train/info_loss': 0.1736302375793457, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011527326423674822, 'train/video_loss': 0.1735149621963501, 'train/total_loss': 0.4094889760017395} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.6107, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5724, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5159, 'grad_norm': 13.56294059753418, 'learning_rate': 1.2804761485368176e-05}[Rank 3] Trainer log: {'loss': 0.5159, 'grad_norm': 13.56294059753418, 'learning_rate': 1.2804761485368176e-05} - -[Rank 0] Trainer log: {'loss': 0.5159, 'grad_norm': 13.56294059753418, 'learning_rate': 1.2804761485368176e-05} -[Rank 2] Trainer log: {'loss': 0.5159, 'grad_norm': 13.56294059753418, 'learning_rate': 1.2804761485368176e-05} -{'loss': 0.5159, 'grad_norm': 13.56294059753418, 'learning_rate': 1.2804761485368176e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2610345840454102, 'train/info_loss': 0.19622613489627838, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011583231389522552, 'train/video_loss': 0.1961103081703186, 'train/total_loss': 0.4571448862552643} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0624, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23113532066345216, 'train/info_loss': 0.1825009435415268, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012029455974698067, 'train/video_loss': 0.18238064646720886, 'train/total_loss': 0.41351598501205444} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0042, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1514, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4277, 'grad_norm': 4.451176166534424, 'learning_rate': 1.2794520528967136e-05}[Rank 0] Trainer log: {'loss': 0.4277, 'grad_norm': 4.451176166534424, 'learning_rate': 1.2794520528967136e-05} -[Rank 1] Trainer log: {'loss': 0.4277, 'grad_norm': 4.451176166534424, 'learning_rate': 1.2794520528967136e-05} - -[Rank 3] Trainer log: {'loss': 0.4277, 'grad_norm': 4.451176166534424, 'learning_rate': 1.2794520528967136e-05} -{'loss': 0.4277, 'grad_norm': 4.451176166534424, 'learning_rate': 1.2794520528967136e-05, 'epoch': 0.44} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.3903, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4101, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022122571244835855, 'train/lm_loss': 3.9382872637361293e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.5009114146232605, 'train/uncertainty_loss': 0.04100866317749024, 'train/video_loss': 0.5437178611755371, 'train/total_loss': 0.5437572598457336} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005427715834230185, 'train/lm_loss': 9.949116501957179e-05, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.05647406727075577, 'train/uncertainty_loss': -7.967038545757532e-05, 'train/video_loss': 0.060768578201532364, 'train/total_loss': 0.060868069529533386} -[Rank 1] Trainer log: {'loss': 0.3829, 'grad_norm': 7.660790920257568, 'learning_rate': 1.2784276392498068e-05}[Rank 3] Trainer log: {'loss': 0.3829, 'grad_norm': 7.660790920257568, 'learning_rate': 1.2784276392498068e-05}[Rank 2] Trainer log: {'loss': 0.3829, 'grad_norm': 7.660790920257568, 'learning_rate': 1.2784276392498068e-05} - - -[Rank 0] Trainer log: {'loss': 0.3829, 'grad_norm': 7.660790920257568, 'learning_rate': 1.2784276392498068e-05} -{'loss': 0.3829, 'grad_norm': 7.660790920257568, 'learning_rate': 1.2784276392498068e-05, 'epoch': 0.44} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1702, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030330116860568526, 'train/lm_loss': 2.9919293592683974e-05, 'train/info_loss': 2.324527122254949e-05, 'train/ref_loss': 0.3223302960395813, 'train/uncertainty_loss': 0.01701674163341522, 'train/video_loss': 0.3417966961860657, 'train/total_loss': 0.3418266177177429} -tensor(0.0776, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1436, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2696938276290894, 'train/info_loss': 0.3149970471858978, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011539034312590957, 'train/video_loss': 0.31488165259361267, 'train/total_loss': 0.5845754742622375} -tensor(0.0792, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3964, 'grad_norm': 6.093995571136475, 'learning_rate': 1.2774029087618448e-05}[Rank 0] Trainer log: {'loss': 0.3964, 'grad_norm': 6.093995571136475, 'learning_rate': 1.2774029087618448e-05}[Rank 2] Trainer log: {'loss': 0.3964, 'grad_norm': 6.093995571136475, 'learning_rate': 1.2774029087618448e-05} -[Rank 1] Trainer log: {'loss': 0.3964, 'grad_norm': 6.093995571136475, 'learning_rate': 1.2774029087618448e-05} - - -{'loss': 0.3964, 'grad_norm': 6.093995571136475, 'learning_rate': 1.2774029087618448e-05, 'epoch': 0.44} -tensor(0.2670, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025840820744633676, 'train/lm_loss': 3.900147567037493e-05, 'train/info_loss': 2.7178979507880285e-05, 'train/ref_loss': 0.159755140542984, 'train/uncertainty_loss': -6.906929193064571e-05, 'train/video_loss': 0.16178052127361298, 'train/total_loss': 0.16181951761245728}tensor(0.2280, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.3051, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032392160501331095, 'train/lm_loss': 0.00012072257231920958, 'train/info_loss': 3.486750210868195e-05, 'train/ref_loss': 0.4275704622268677, 'train/uncertainty_loss': 0.030505701899528503, 'train/video_loss': 0.46070241928100586, 'train/total_loss': 0.4608231484889984} -tensor(0.0547, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2909, 'grad_norm': 8.92898941040039, 'learning_rate': 1.2763778625989355e-05}[Rank 2] Trainer log: {'loss': 0.2909, 'grad_norm': 8.92898941040039, 'learning_rate': 1.2763778625989355e-05}[Rank 0] Trainer log: {'loss': 0.2909, 'grad_norm': 8.92898941040039, 'learning_rate': 1.2763778625989355e-05} - -[Rank 1] Trainer log: {'loss': 0.2909, 'grad_norm': 8.92898941040039, 'learning_rate': 1.2763778625989355e-05} - -{'loss': 0.2909, 'grad_norm': 8.92898941040039, 'learning_rate': 1.2763778625989355e-05, 'epoch': 0.44} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27367002964019777, 'train/info_loss': 0.18567970395088196, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012015106622129679, 'train/video_loss': 0.18555955588817596, 'train/total_loss': 0.45922958850860596} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25647428035736086, 'train/info_loss': 0.20202060043811798, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012038141721859574, 'train/video_loss': 0.20190021395683289, 'train/total_loss': 0.4583745002746582} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4678, 'grad_norm': 2.393505096435547, 'learning_rate': 1.275352501927546e-05}[Rank 1] Trainer log: {'loss': 0.4678, 'grad_norm': 2.393505096435547, 'learning_rate': 1.275352501927546e-05}[Rank 3] Trainer log: {'loss': 0.4678, 'grad_norm': 2.393505096435547, 'learning_rate': 1.275352501927546e-05} - - -[Rank 2] Trainer log: {'loss': 0.4678, 'grad_norm': 2.393505096435547, 'learning_rate': 1.275352501927546e-05} -{'loss': 0.4678, 'grad_norm': 2.393505096435547, 'learning_rate': 1.275352501927546e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0737, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006703344639390707, 'train/lm_loss': 3.432933008298278e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.17258904874324799, 'train/uncertainty_loss': -8.153958478942514e-05, 'train/video_loss': 0.17789509892463684, 'train/total_loss': 0.17792943120002747} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30582799911499026, 'train/info_loss': 0.22289833426475525, 'train/ref_loss': None, 'train/uncertainty_loss': -9.900146396830679e-05, 'train/video_loss': 0.22279933094978333, 'train/total_loss': 0.528627336025238} -tensor(0.4052, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4283, 'grad_norm': 9.129878997802734, 'learning_rate': 1.2743268279145018e-05}[Rank 2] Trainer log: {'loss': 0.4283, 'grad_norm': 9.129878997802734, 'learning_rate': 1.2743268279145018e-05} -[Rank 3] Trainer log: {'loss': 0.4283, 'grad_norm': 9.129878997802734, 'learning_rate': 1.2743268279145018e-05} - -[Rank 0] Trainer log: {'loss': 0.4283, 'grad_norm': 9.129878997802734, 'learning_rate': 1.2743268279145018e-05} -{'loss': 0.4283, 'grad_norm': 9.129878997802734, 'learning_rate': 1.2743268279145018e-05, 'epoch': 0.44} -tensor(-0.0017, device='cuda:2', grad_fn=) tensor(-0.0017, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0763, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2596, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022420347668230536, 'train/lm_loss': 3.9382872637361293e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.39139097929000854, 'train/uncertainty_loss': 0.025960922241210938, 'train/video_loss': 0.4191740155220032, 'train/total_loss': 0.4192133843898773} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42951779365539555, 'train/info_loss': 0.15621526539325714, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011064962018281221, 'train/video_loss': 0.15610460937023163, 'train/total_loss': 0.5856224298477173} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4376, 'grad_norm': 9.75659465789795, 'learning_rate': 1.2733008417269841e-05} -[Rank 2] Trainer log: {'loss': 0.4376, 'grad_norm': 9.75659465789795, 'learning_rate': 1.2733008417269841e-05} -[Rank 0] Trainer log: {'loss': 0.4376, 'grad_norm': 9.75659465789795, 'learning_rate': 1.2733008417269841e-05}[Rank 3] Trainer log: {'loss': 0.4376, 'grad_norm': 9.75659465789795, 'learning_rate': 1.2733008417269841e-05} - -{'loss': 0.4376, 'grad_norm': 9.75659465789795, 'learning_rate': 1.2733008417269841e-05, 'epoch': 0.44} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15190689563751222, 'train/info_loss': 0.17809665203094482, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010339764412492514, 'train/video_loss': 0.17799325287342072, 'train/total_loss': 0.3299001455307007} -tensor(0.1890, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1394, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1931, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028250492177903655, 'train/lm_loss': 5.721227498725057e-05, 'train/info_loss': 3.379469126230106e-05, 'train/ref_loss': 0.341951459646225, 'train/uncertainty_loss': 0.019311708211898804, 'train/video_loss': 0.36355698108673096, 'train/total_loss': 0.36361420154571533} -[Rank 0] Trainer log: {'loss': 0.3985, 'grad_norm': 13.816030502319336, 'learning_rate': 1.2722745445325301e-05}[Rank 2] Trainer log: {'loss': 0.3985, 'grad_norm': 13.816030502319336, 'learning_rate': 1.2722745445325301e-05}[Rank 3] Trainer log: {'loss': 0.3985, 'grad_norm': 13.816030502319336, 'learning_rate': 1.2722745445325301e-05} - -[Rank 1] Trainer log: {'loss': 0.3985, 'grad_norm': 13.816030502319336, 'learning_rate': 1.2722745445325301e-05} - -{'loss': 0.3985, 'grad_norm': 13.816030502319336, 'learning_rate': 1.2722745445325301e-05, 'epoch': 0.44} -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05717803835868836, 'train/info_loss': 0.09733711183071136, 'train/ref_loss': None, 'train/uncertainty_loss': -8.451424073427916e-05, 'train/video_loss': 0.09725259989500046, 'train/total_loss': 0.15443064272403717} -tensor(0.0850, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0006, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26037011146545413, 'train/info_loss': 0.13706988096237183, 'train/ref_loss': None, 'train/uncertainty_loss': -9.362276759929956e-05, 'train/video_loss': 0.13697625696659088, 'train/total_loss': 0.3973463773727417} -tensor(0.1064, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1329, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2917, 'grad_norm': 6.609551429748535, 'learning_rate': 1.2712479374990303e-05} -[Rank 2] Trainer log: {'loss': 0.2917, 'grad_norm': 6.609551429748535, 'learning_rate': 1.2712479374990303e-05}[Rank 3] Trainer log: {'loss': 0.2917, 'grad_norm': 6.609551429748535, 'learning_rate': 1.2712479374990303e-05}[Rank 0] Trainer log: {'loss': 0.2917, 'grad_norm': 6.609551429748535, 'learning_rate': 1.2712479374990303e-05} - - -{'loss': 0.2917, 'grad_norm': 6.609551429748535, 'learning_rate': 1.2712479374990303e-05, 'epoch': 0.44} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2858, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003399609122425318, 'train/lm_loss': 4.3458986328914766e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.4097214341163635, 'train/uncertainty_loss': 0.028578448295593264, 'train/video_loss': 0.4410476088523865, 'train/total_loss': 0.44109106063842773} -tensor(0.3437, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1968, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3513, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020953463390469552, 'train/lm_loss': 4.422175697982311e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.45541703701019287, 'train/uncertainty_loss': 0.03513402044773102, 'train/video_loss': 0.4922553598880768, 'train/total_loss': 0.4922995865345001} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.421, 'grad_norm': 4.8718366622924805, 'learning_rate': 1.2702210217947289e-05} -[Rank 3] Trainer log: {'loss': 0.421, 'grad_norm': 4.8718366622924805, 'learning_rate': 1.2702210217947289e-05} -[Rank 0] Trainer log: {'loss': 0.421, 'grad_norm': 4.8718366622924805, 'learning_rate': 1.2702210217947289e-05}[Rank 1] Trainer log: {'loss': 0.421, 'grad_norm': 4.8718366622924805, 'learning_rate': 1.2702210217947289e-05} - -{'loss': 0.421, 'grad_norm': 4.8718366622924805, 'learning_rate': 1.2702210217947289e-05, 'epoch': 0.44} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3569701671600342, 'train/info_loss': 0.28798869252204895, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011215175036340953, 'train/video_loss': 0.28787654638290405, 'train/total_loss': 0.6448466777801514} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0852, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029614069499075415, 'train/lm_loss': 2.6343518402427436e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.21511472761631012, 'train/uncertainty_loss': -7.078309427015484e-05, 'train/video_loss': 0.2174379825592041, 'train/total_loss': 0.21746432781219482} -[Rank 1] Trainer log: {'loss': 0.4203, 'grad_norm': 4.827730655670166, 'learning_rate': 1.2691937985882194e-05}[Rank 2] Trainer log: {'loss': 0.4203, 'grad_norm': 4.827730655670166, 'learning_rate': 1.2691937985882194e-05} - -[Rank 0] Trainer log: {'loss': 0.4203, 'grad_norm': 4.827730655670166, 'learning_rate': 1.2691937985882194e-05}[Rank 3] Trainer log: {'loss': 0.4203, 'grad_norm': 4.827730655670166, 'learning_rate': 1.2691937985882194e-05} - -{'loss': 0.4203, 'grad_norm': 4.827730655670166, 'learning_rate': 1.2691937985882194e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17287538051605225, 'train/info_loss': 0.12024251371622086, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010087609989568592, 'train/video_loss': 0.1201416403055191, 'train/total_loss': 0.29301702976226807} -tensor(0.0109, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.41298532485961914, 'train/info_loss': 0.2221626341342926, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010717380791902542, 'train/video_loss': 0.22205546498298645, 'train/total_loss': 0.6350407600402832} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3935, 'grad_norm': 2.876939296722412, 'learning_rate': 1.2681662690484476e-05}[Rank 1] Trainer log: {'loss': 0.3935, 'grad_norm': 2.876939296722412, 'learning_rate': 1.2681662690484476e-05} - -[Rank 0] Trainer log: {'loss': 0.3935, 'grad_norm': 2.876939296722412, 'learning_rate': 1.2681662690484476e-05}[Rank 3] Trainer log: {'loss': 0.3935, 'grad_norm': 2.876939296722412, 'learning_rate': 1.2681662690484476e-05} - -{'loss': 0.3935, 'grad_norm': 2.876939296722412, 'learning_rate': 1.2681662690484476e-05, 'epoch': 0.44} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28106787204742434, 'train/info_loss': 0.14399763941764832, 'train/ref_loss': None, 'train/uncertainty_loss': -8.300706394948065e-05, 'train/video_loss': 0.1439146250486374, 'train/total_loss': 0.424982488155365} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3115, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1479633092880249, 'train/info_loss': 0.16022111475467682, 'train/ref_loss': None, 'train/uncertainty_loss': -8.483913261443378e-05, 'train/video_loss': 0.16013628244400024, 'train/total_loss': 0.3080995976924896} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.382, 'grad_norm': 4.570655345916748, 'learning_rate': 1.2671384343447057e-05}[Rank 0] Trainer log: {'loss': 0.382, 'grad_norm': 4.570655345916748, 'learning_rate': 1.2671384343447057e-05} -[Rank 2] Trainer log: {'loss': 0.382, 'grad_norm': 4.570655345916748, 'learning_rate': 1.2671384343447057e-05} -[Rank 1] Trainer log: {'loss': 0.382, 'grad_norm': 4.570655345916748, 'learning_rate': 1.2671384343447057e-05} - -{'loss': 0.382, 'grad_norm': 4.570655345916748, 'learning_rate': 1.2671384343447057e-05, 'epoch': 0.44} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2174, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1087, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024200878106057645, 'train/lm_loss': 2.989545464515686e-05, 'train/info_loss': 2.592734745121561e-05, 'train/ref_loss': 0.27784183621406555, 'train/uncertainty_loss': 0.010866475850343706, 'train/video_loss': 0.2906703054904938, 'train/total_loss': 0.29070019721984863} -tensor(0.1649, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07688860893249512, 'train/info_loss': 0.17461155354976654, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010446697706356645, 'train/video_loss': 0.17450708150863647, 'train/total_loss': 0.25139570236206055} -tensor(0.0169, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3711, 'grad_norm': 2.252570152282715, 'learning_rate': 1.2661102956466345e-05}[Rank 1] Trainer log: {'loss': 0.3711, 'grad_norm': 2.252570152282715, 'learning_rate': 1.2661102956466345e-05} - -[Rank 0] Trainer log: {'loss': 0.3711, 'grad_norm': 2.252570152282715, 'learning_rate': 1.2661102956466345e-05}[Rank 3] Trainer log: {'loss': 0.3711, 'grad_norm': 2.252570152282715, 'learning_rate': 1.2661102956466345e-05} - -{'loss': 0.3711, 'grad_norm': 2.252570152282715, 'learning_rate': 1.2661102956466345e-05, 'epoch': 0.44} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0290, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2962508678436279, 'train/info_loss': 0.1558769792318344, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012192605063319207, 'train/video_loss': 0.15575505793094635, 'train/total_loss': 0.45200592279434204} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0463, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(0.0771, device='cuda:3', grad_fn=) {'train/tv_loss': 0.00019761170260608197, 'train/lm_loss': 5.714077269658446e-05, 'train/info_loss': 3.3556287235114723e-05, 'train/ref_loss': 0.2249104082584381, 'train/uncertainty_loss': 0.0046314973384141925, 'train/video_loss': 0.2311563640832901, 'train/total_loss': 0.2312135100364685} -tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3141, 'grad_norm': 7.939483642578125, 'learning_rate': 1.2650818541242207e-05}[Rank 3] Trainer log: {'loss': 0.3141, 'grad_norm': 7.939483642578125, 'learning_rate': 1.2650818541242207e-05}[Rank 2] Trainer log: {'loss': 0.3141, 'grad_norm': 7.939483642578125, 'learning_rate': 1.2650818541242207e-05} - - -[Rank 0] Trainer log: {'loss': 0.3141, 'grad_norm': 7.939483642578125, 'learning_rate': 1.2650818541242207e-05} -{'loss': 0.3141, 'grad_norm': 7.939483642578125, 'learning_rate': 1.2650818541242207e-05, 'epoch': 0.44} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05777530074119568, 'train/info_loss': 0.2060435712337494, 'train/ref_loss': None, 'train/uncertainty_loss': -8.613975951448083e-05, 'train/video_loss': 0.20595742762088776, 'train/total_loss': 0.2637327313423157} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0923, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3041990756988526, 'train/info_loss': 0.18739619851112366, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011992970248684288, 'train/video_loss': 0.18727627396583557, 'train/total_loss': 0.49147534370422363} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 2.7406392097473145, 'learning_rate': 1.264053110947795e-05} -[Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 2.7406392097473145, 'learning_rate': 1.264053110947795e-05} -[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 2.7406392097473145, 'learning_rate': 1.264053110947795e-05} -[Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 2.7406392097473145, 'learning_rate': 1.264053110947795e-05} -{'loss': 0.3904, 'grad_norm': 2.7406392097473145, 'learning_rate': 1.264053110947795e-05, 'epoch': 0.44} -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08416563868522645, 'train/info_loss': 0.24991318583488464, 'train/ref_loss': None, 'train/uncertainty_loss': -8.131943759508431e-05, 'train/video_loss': 0.24983187019824982, 'train/total_loss': 0.333997517824173} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1945142984390259, 'train/info_loss': 0.21589843928813934, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010789184598252178, 'train/video_loss': 0.2157905548810959, 'train/total_loss': 0.41030484437942505} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3219, 'grad_norm': 2.962953805923462, 'learning_rate': 1.2630240672880316e-05}[Rank 1] Trainer log: {'loss': 0.3219, 'grad_norm': 2.962953805923462, 'learning_rate': 1.2630240672880316e-05}[Rank 3] Trainer log: {'loss': 0.3219, 'grad_norm': 2.962953805923462, 'learning_rate': 1.2630240672880316e-05} - - -[Rank 0] Trainer log: {'loss': 0.3219, 'grad_norm': 2.962953805923462, 'learning_rate': 1.2630240672880316e-05} -{'loss': 0.3219, 'grad_norm': 2.962953805923462, 'learning_rate': 1.2630240672880316e-05, 'epoch': 0.44} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28996031284332274, 'train/info_loss': 0.24081800878047943, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001097807427868247, 'train/video_loss': 0.24070823192596436, 'train/total_loss': 0.5306685566902161} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2510, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.5696, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020347367972135545, 'train/lm_loss': 5.7045428548008204e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.5838445425033569, 'train/uncertainty_loss': 0.056955814361572266, 'train/video_loss': 0.642458438873291, 'train/total_loss': 0.6425154805183411} -[Rank 1] Trainer log: {'loss': 0.4442, 'grad_norm': 10.299372673034668, 'learning_rate': 1.2619947243159472e-05} -[Rank 2] Trainer log: {'loss': 0.4442, 'grad_norm': 10.299372673034668, 'learning_rate': 1.2619947243159472e-05}[Rank 3] Trainer log: {'loss': 0.4442, 'grad_norm': 10.299372673034668, 'learning_rate': 1.2619947243159472e-05} - -[Rank 0] Trainer log: {'loss': 0.4442, 'grad_norm': 10.299372673034668, 'learning_rate': 1.2619947243159472e-05} -{'loss': 0.4442, 'grad_norm': 10.299372673034668, 'learning_rate': 1.2619947243159472e-05, 'epoch': 0.44} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2562, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002900954568758607, 'train/lm_loss': 4.453163128346205e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.3869965672492981, 'train/uncertainty_loss': 0.02561841905117035, 'train/video_loss': 0.41496706008911133, 'train/total_loss': 0.41501158475875854} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09151319265365601, 'train/info_loss': 0.2287430465221405, 'train/ref_loss': None, 'train/uncertainty_loss': -9.512531105428935e-05, 'train/video_loss': 0.22864791750907898, 'train/total_loss': 0.3201611042022705} -tensor(0.0509, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4301, 'grad_norm': 7.453306198120117, 'learning_rate': 1.2609650832028979e-05}[Rank 1] Trainer log: {'loss': 0.4301, 'grad_norm': 7.453306198120117, 'learning_rate': 1.2609650832028979e-05}[Rank 3] Trainer log: {'loss': 0.4301, 'grad_norm': 7.453306198120117, 'learning_rate': 1.2609650832028979e-05} - - -[Rank 2] Trainer log: {'loss': 0.4301, 'grad_norm': 7.453306198120117, 'learning_rate': 1.2609650832028979e-05} -{'loss': 0.4301, 'grad_norm': 7.453306198120117, 'learning_rate': 1.2609650832028979e-05, 'epoch': 0.45} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2402961254119873, 'train/info_loss': 0.15297341346740723, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011051481124013662, 'train/video_loss': 0.15286289155483246, 'train/total_loss': 0.39315903186798096} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023037232458591463, 'train/lm_loss': 4.419792094267905e-05, 'train/info_loss': 2.962263170047663e-05, 'train/ref_loss': 0.09540881216526031, 'train/uncertainty_loss': -7.073299493640662e-05, 'train/video_loss': 0.09721067547798157, 'train/total_loss': 0.09725487232208252} -tensor(0.1549, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2944, 'grad_norm': 3.0121352672576904, 'learning_rate': 1.2599351451205802e-05} -[Rank 2] Trainer log: {'loss': 0.2944, 'grad_norm': 3.0121352672576904, 'learning_rate': 1.2599351451205802e-05} -[Rank 0] Trainer log: {'loss': 0.2944, 'grad_norm': 3.0121352672576904, 'learning_rate': 1.2599351451205802e-05}[Rank 3] Trainer log: {'loss': 0.2944, 'grad_norm': 3.0121352672576904, 'learning_rate': 1.2599351451205802e-05} - -{'loss': 0.2944, 'grad_norm': 3.0121352672576904, 'learning_rate': 1.2599351451205802e-05, 'epoch': 0.45} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1284, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002006109803915024, 'train/lm_loss': 4.395955475047231e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.09141207486391068, 'train/uncertainty_loss': -7.001569611020386e-05, 'train/video_loss': 0.09297633171081543, 'train/total_loss': 0.09302029013633728} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2852074146270752, 'train/info_loss': 0.1442277431488037, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010918223997578025, 'train/video_loss': 0.1441185623407364, 'train/total_loss': 0.42932599782943726} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3031, 'grad_norm': 8.640837669372559, 'learning_rate': 1.2589049112410281e-05} -[Rank 3] Trainer log: {'loss': 0.3031, 'grad_norm': 8.640837669372559, 'learning_rate': 1.2589049112410281e-05}[Rank 1] Trainer log: {'loss': 0.3031, 'grad_norm': 8.640837669372559, 'learning_rate': 1.2589049112410281e-05} - -[Rank 0] Trainer log: {'loss': 0.3031, 'grad_norm': 8.640837669372559, 'learning_rate': 1.2589049112410281e-05} -{'loss': 0.3031, 'grad_norm': 8.640837669372559, 'learning_rate': 1.2589049112410281e-05, 'epoch': 0.45} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.9757, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022495663724839688, 'train/lm_loss': 5.721227498725057e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.9131650328636169, 'train/uncertainty_loss': 0.09756845235824585, 'train/video_loss': 1.0125644207000732, 'train/total_loss': 1.0126216411590576} -tensor(0.2103, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2606720209121704, 'train/info_loss': 0.27344444394111633, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014084639260545374, 'train/video_loss': 0.2733035981655121, 'train/total_loss': 0.5339756011962891} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.2206, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.5266, 'grad_norm': 14.940010070800781, 'learning_rate': 1.2578743827366124e-05}[Rank 1] Trainer log: {'loss': 0.5266, 'grad_norm': 14.940010070800781, 'learning_rate': 1.2578743827366124e-05} -[Rank 3] Trainer log: {'loss': 0.5266, 'grad_norm': 14.940010070800781, 'learning_rate': 1.2578743827366124e-05} - -[Rank 2] Trainer log: {'loss': 0.5266, 'grad_norm': 14.940010070800781, 'learning_rate': 1.2578743827366124e-05} -{'loss': 0.5266, 'grad_norm': 14.940010070800781, 'learning_rate': 1.2578743827366124e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028902050107717516, 'train/lm_loss': 3.3947924384847285e-05, 'train/info_loss': 2.4735316401347518e-05, 'train/ref_loss': 0.15605519711971283, 'train/uncertainty_loss': -7.110789883881808e-05, 'train/video_loss': 0.15832099318504333, 'train/total_loss': 0.15835493803024292} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028992476873099807, 'train/lm_loss': 4.353049444034696e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.192971333861351, 'train/uncertainty_loss': -6.97728362865746e-05, 'train/video_loss': 0.19524963200092316, 'train/total_loss': 0.19529315829277039} -[Rank 0] Trainer log: {'loss': 0.3201, 'grad_norm': 4.731839179992676, 'learning_rate': 1.2568435607800386e-05}[Rank 1] Trainer log: {'loss': 0.3201, 'grad_norm': 4.731839179992676, 'learning_rate': 1.2568435607800386e-05} -[Rank 2] Trainer log: {'loss': 0.3201, 'grad_norm': 4.731839179992676, 'learning_rate': 1.2568435607800386e-05} - -[Rank 3] Trainer log: {'loss': 0.3201, 'grad_norm': 4.731839179992676, 'learning_rate': 1.2568435607800386e-05} -{'loss': 0.3201, 'grad_norm': 4.731839179992676, 'learning_rate': 1.2568435607800386e-05, 'epoch': 0.45} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1027964472770691, 'train/info_loss': 0.23222950100898743, 'train/ref_loss': None, 'train/uncertainty_loss': -8.989346097223461e-05, 'train/video_loss': 0.23213960230350494, 'train/total_loss': 0.3349360525608063} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002530387369915843, 'train/lm_loss': 4.367351357359439e-05, 'train/info_loss': 2.962263170047663e-05, 'train/ref_loss': 0.047715745866298676, 'train/uncertainty_loss': -7.015187293291092e-05, 'train/video_loss': 0.04969953000545502, 'train/total_loss': 0.049743205308914185} -tensor(0.2885, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2799, 'grad_norm': 6.760186672210693, 'learning_rate': 1.255812446544347e-05}[Rank 0] Trainer log: {'loss': 0.2799, 'grad_norm': 6.760186672210693, 'learning_rate': 1.255812446544347e-05} -[Rank 3] Trainer log: {'loss': 0.2799, 'grad_norm': 6.760186672210693, 'learning_rate': 1.255812446544347e-05} - -[Rank 2] Trainer log: {'loss': 0.2799, 'grad_norm': 6.760186672210693, 'learning_rate': 1.255812446544347e-05} -{'loss': 0.2799, 'grad_norm': 6.760186672210693, 'learning_rate': 1.255812446544347e-05, 'epoch': 0.45} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003459489438682795, 'train/lm_loss': 2.6224323664791883e-05, 'train/info_loss': 2.396049239905551e-05, 'train/ref_loss': 0.20069143176078796, 'train/uncertainty_loss': -7.214411743916571e-05, 'train/video_loss': 0.20341083407402039, 'train/total_loss': 0.20343706011772156} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1157, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023445060942322017, 'train/lm_loss': 3.3638032618910074e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.10462568700313568, 'train/uncertainty_loss': -7.081012008711696e-05, 'train/video_loss': 0.10645700246095657, 'train/total_loss': 0.10649064183235168} -[Rank 3] Trainer log: {'loss': 0.3432, 'grad_norm': 1.6867645978927612, 'learning_rate': 1.2547810412029099e-05}[Rank 1] Trainer log: {'loss': 0.3432, 'grad_norm': 1.6867645978927612, 'learning_rate': 1.2547810412029099e-05}[Rank 0] Trainer log: {'loss': 0.3432, 'grad_norm': 1.6867645978927612, 'learning_rate': 1.2547810412029099e-05} - - -[Rank 2] Trainer log: {'loss': 0.3432, 'grad_norm': 1.6867645978927612, 'learning_rate': 1.2547810412029099e-05} -{'loss': 0.3432, 'grad_norm': 1.6867645978927612, 'learning_rate': 1.2547810412029099e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1128543496131897, 'train/info_loss': 0.21818633377552032, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011747631942853332, 'train/video_loss': 0.21806885302066803, 'train/total_loss': 0.3309231996536255} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.4112, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0350, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1149, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0508, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019909366965293885, 'train/lm_loss': 5.671174149028957e-05, 'train/info_loss': 3.325828583911061e-05, 'train/ref_loss': 0.25130629539489746, 'train/uncertainty_loss': 0.005076046660542489, 'train/video_loss': 0.25800836086273193, 'train/total_loss': 0.2580650746822357} -[Rank 1] Trainer log: {'loss': 0.3167, 'grad_norm': 4.48518180847168, 'learning_rate': 1.2537493459294306e-05}[Rank 3] Trainer log: {'loss': 0.3167, 'grad_norm': 4.48518180847168, 'learning_rate': 1.2537493459294306e-05} -[Rank 2] Trainer log: {'loss': 0.3167, 'grad_norm': 4.48518180847168, 'learning_rate': 1.2537493459294306e-05} - -[Rank 0] Trainer log: {'loss': 0.3167, 'grad_norm': 4.48518180847168, 'learning_rate': 1.2537493459294306e-05} -{'loss': 0.3167, 'grad_norm': 4.48518180847168, 'learning_rate': 1.2537493459294306e-05, 'epoch': 0.45} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3368321180343628, 'train/info_loss': 0.17197874188423157, 'train/ref_loss': None, 'train/uncertainty_loss': -9.675802430137993e-05, 'train/video_loss': 0.17188198864459991, 'train/total_loss': 0.5087141394615173} -tensor(0.1666, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0187, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10434095859527588, 'train/info_loss': 0.16010090708732605, 'train/ref_loss': None, 'train/uncertainty_loss': -9.756419458426536e-05, 'train/video_loss': 0.16000334918498993, 'train/total_loss': 0.26434430480003357} -tensor(0.2701, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4232, 'grad_norm': 9.71017074584961, 'learning_rate': 1.2527173618979435e-05}[Rank 2] Trainer log: {'loss': 0.4232, 'grad_norm': 9.71017074584961, 'learning_rate': 1.2527173618979435e-05}[Rank 1] Trainer log: {'loss': 0.4232, 'grad_norm': 9.71017074584961, 'learning_rate': 1.2527173618979435e-05} - - -[Rank 3] Trainer log: {'loss': 0.4232, 'grad_norm': 9.71017074584961, 'learning_rate': 1.2527173618979435e-05} -{'loss': 0.4232, 'grad_norm': 9.71017074584961, 'learning_rate': 1.2527173618979435e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2151, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003480943851172924, 'train/lm_loss': 3.881077864207328e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.18789350986480713, 'train/uncertainty_loss': -7.021006313152611e-05, 'train/video_loss': 0.19063720107078552, 'train/total_loss': 0.1906760185956955} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0709, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004241571761667729, 'train/lm_loss': 2.975242678076029e-05, 'train/info_loss': 2.4377704903599806e-05, 'train/ref_loss': 0.26231443881988525, 'train/uncertainty_loss': 0.007085143774747849, 'train/video_loss': 0.2728172242641449, 'train/total_loss': 0.2728469669818878} -[Rank 0] Trainer log: {'loss': 0.3418, 'grad_norm': 14.133904457092285, 'learning_rate': 1.2516850902828102e-05}[Rank 1] Trainer log: {'loss': 0.3418, 'grad_norm': 14.133904457092285, 'learning_rate': 1.2516850902828102e-05}[Rank 3] Trainer log: {'loss': 0.3418, 'grad_norm': 14.133904457092285, 'learning_rate': 1.2516850902828102e-05} - - -[Rank 2] Trainer log: {'loss': 0.3418, 'grad_norm': 14.133904457092285, 'learning_rate': 1.2516850902828102e-05} -{'loss': 0.3418, 'grad_norm': 14.133904457092285, 'learning_rate': 1.2516850902828102e-05, 'epoch': 0.45} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1338, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1486, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002202600706368685, 'train/lm_loss': 5.675940774381161e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.31044092774391174, 'train/uncertainty_loss': 0.014861442148685455, 'train/video_loss': 0.32709622383117676, 'train/total_loss': 0.3271529972553253} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2261, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0106, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005210274830460548, 'train/lm_loss': 8.083140128292144e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.2198091447353363, 'train/uncertainty_loss': 0.0010576002299785615, 'train/video_loss': 0.22506600618362427, 'train/total_loss': 0.2251468449831009} -[Rank 0] Trainer log: {'loss': 0.3993, 'grad_norm': 10.934391021728516, 'learning_rate': 1.2506525322587207e-05}[Rank 1] Trainer log: {'loss': 0.3993, 'grad_norm': 10.934391021728516, 'learning_rate': 1.2506525322587207e-05} -[Rank 3] Trainer log: {'loss': 0.3993, 'grad_norm': 10.934391021728516, 'learning_rate': 1.2506525322587207e-05} - -[Rank 2] Trainer log: {'loss': 0.3993, 'grad_norm': 10.934391021728516, 'learning_rate': 1.2506525322587207e-05} -{'loss': 0.3993, 'grad_norm': 10.934391021728516, 'learning_rate': 1.2506525322587207e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000333463354036212, 'train/lm_loss': 6.498234579339623e-05, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.16739708185195923, 'train/uncertainty_loss': -7.282291189767421e-05, 'train/video_loss': 0.17002880573272705, 'train/total_loss': 0.17009378969669342} -tensor(0.2013, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1091, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029488813597708944, 'train/lm_loss': 2.975242678076029e-05, 'train/info_loss': 2.5510136765660718e-05, 'train/ref_loss': 0.1849384307861328, 'train/uncertainty_loss': -6.99950265698135e-05, 'train/video_loss': 0.18725305795669556, 'train/total_loss': 0.18728281557559967} -[Rank 3] Trainer log: {'loss': 0.2955, 'grad_norm': 6.132021427154541, 'learning_rate': 1.24961968900069e-05} -[Rank 1] Trainer log: {'loss': 0.2955, 'grad_norm': 6.132021427154541, 'learning_rate': 1.24961968900069e-05}[Rank 2] Trainer log: {'loss': 0.2955, 'grad_norm': 6.132021427154541, 'learning_rate': 1.24961968900069e-05} -[Rank 0] Trainer log: {'loss': 0.2955, 'grad_norm': 6.132021427154541, 'learning_rate': 1.24961968900069e-05} - -{'loss': 0.2955, 'grad_norm': 6.132021427154541, 'learning_rate': 1.24961968900069e-05, 'epoch': 0.45} -tensor(0.1540, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0056, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5900, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024133252445608379, 'train/lm_loss': 4.972793394699693e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.18146270513534546, 'train/uncertainty_loss': -7.168937590904534e-05, 'train/video_loss': 0.1833542138338089, 'train/total_loss': 0.18340393900871277} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16081907749176028, 'train/info_loss': 0.1571955531835556, 'train/ref_loss': None, 'train/uncertainty_loss': -9.702949319034815e-05, 'train/video_loss': 0.15709851682186127, 'train/total_loss': 0.3179175853729248} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.5500, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0452, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.345, 'grad_norm': 24.996017456054688, 'learning_rate': 1.2485865616840586e-05}[Rank 3] Trainer log: {'loss': 0.345, 'grad_norm': 24.996017456054688, 'learning_rate': 1.2485865616840586e-05} -[Rank 0] Trainer log: {'loss': 0.345, 'grad_norm': 24.996017456054688, 'learning_rate': 1.2485865616840586e-05} -[Rank 2] Trainer log: {'loss': 0.345, 'grad_norm': 24.996017456054688, 'learning_rate': 1.2485865616840586e-05} - -{'loss': 0.345, 'grad_norm': 24.996017456054688, 'learning_rate': 1.2485865616840586e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0447, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006564086768776179, 'train/lm_loss': 3.401943831704557e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.10811207443475723, 'train/uncertainty_loss': -7.342686294578017e-05, 'train/video_loss': 0.11331906169652939, 'train/total_loss': 0.11335308104753494} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2154, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3346, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002599187195301056, 'train/lm_loss': 8.407256100326777e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.4342034161090851, 'train/uncertainty_loss': 0.033461421728134155, 'train/video_loss': 0.46978017687797546, 'train/total_loss': 0.46986424922943115} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3396, 'grad_norm': 7.94145393371582, 'learning_rate': 1.2475531514844893e-05}[Rank 0] Trainer log: {'loss': 0.3396, 'grad_norm': 7.94145393371582, 'learning_rate': 1.2475531514844893e-05} -[Rank 1] Trainer log: {'loss': 0.3396, 'grad_norm': 7.94145393371582, 'learning_rate': 1.2475531514844893e-05} - -[Rank 2] Trainer log: {'loss': 0.3396, 'grad_norm': 7.94145393371582, 'learning_rate': 1.2475531514844893e-05} -{'loss': 0.3396, 'grad_norm': 7.94145393371582, 'learning_rate': 1.2475531514844893e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14170889854431154, 'train/info_loss': 0.19218696653842926, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011722762137651445, 'train/video_loss': 0.19206973910331726, 'train/total_loss': 0.33377861976623535} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0687, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0017, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17485718727111818, 'train/info_loss': 0.22927629947662354, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001698893029242754, 'train/video_loss': 0.22910641133785248, 'train/total_loss': 0.4039635956287384} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3549, 'grad_norm': 6.4409708976745605, 'learning_rate': 1.2465194595779679e-05} -[Rank 1] Trainer log: {'loss': 0.3549, 'grad_norm': 6.4409708976745605, 'learning_rate': 1.2465194595779679e-05} -[Rank 0] Trainer log: {'loss': 0.3549, 'grad_norm': 6.4409708976745605, 'learning_rate': 1.2465194595779679e-05}[Rank 3] Trainer log: {'loss': 0.3549, 'grad_norm': 6.4409708976745605, 'learning_rate': 1.2465194595779679e-05} - -{'loss': 0.3549, 'grad_norm': 6.4409708976745605, 'learning_rate': 1.2465194595779679e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3040521860122681, 'train/info_loss': 0.2271643429994583, 'train/ref_loss': None, 'train/uncertainty_loss': -9.252637973986567e-05, 'train/video_loss': 0.2270718216896057, 'train/total_loss': 0.5311239957809448} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(0.0879, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032470906153321267, 'train/lm_loss': 3.356651868671179e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.16801048815250397, 'train/uncertainty_loss': -7.08421808667481e-05, 'train/video_loss': 0.17056146264076233, 'train/total_loss': 0.17059503495693207} -[Rank 1] Trainer log: {'loss': 0.3667, 'grad_norm': 4.069051265716553, 'learning_rate': 1.2454854871407993e-05}[Rank 3] Trainer log: {'loss': 0.3667, 'grad_norm': 4.069051265716553, 'learning_rate': 1.2454854871407993e-05} -[Rank 0] Trainer log: {'loss': 0.3667, 'grad_norm': 4.069051265716553, 'learning_rate': 1.2454854871407993e-05} - -[Rank 2] Trainer log: {'loss': 0.3667, 'grad_norm': 4.069051265716553, 'learning_rate': 1.2454854871407993e-05} -{'loss': 0.3667, 'grad_norm': 4.069051265716553, 'learning_rate': 1.2454854871407993e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2360884189605713, 'train/info_loss': 0.16744272410869598, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013990416191518308, 'train/video_loss': 0.16730281710624695, 'train/total_loss': 0.4033912420272827} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1449, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21553852558135989, 'train/info_loss': 0.17705628275871277, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012561285402625799, 'train/video_loss': 0.17693066596984863, 'train/total_loss': 0.392469197511673} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.2491, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3995, 'grad_norm': 8.742913246154785, 'learning_rate': 1.244451235349609e-05}[Rank 1] Trainer log: {'loss': 0.3995, 'grad_norm': 8.742913246154785, 'learning_rate': 1.244451235349609e-05} - -[Rank 2] Trainer log: {'loss': 0.3995, 'grad_norm': 8.742913246154785, 'learning_rate': 1.244451235349609e-05} -[Rank 3] Trainer log: {'loss': 0.3995, 'grad_norm': 8.742913246154785, 'learning_rate': 1.244451235349609e-05} -{'loss': 0.3995, 'grad_norm': 8.742913246154785, 'learning_rate': 1.244451235349609e-05, 'epoch': 0.45} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.2042, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029323622584342957, 'train/lm_loss': 3.881077864207328e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.3494979739189148, 'train/uncertainty_loss': 0.020420454442501068, 'train/video_loss': 0.372292160987854, 'train/total_loss': 0.3723309636116028} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23167409896850588, 'train/info_loss': 0.262124240398407, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012180738849565387, 'train/video_loss': 0.26200243830680847, 'train/total_loss': 0.4936765432357788} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3974, 'grad_norm': 2.5723137855529785, 'learning_rate': 1.2434167053813399e-05}[Rank 3] Trainer log: {'loss': 0.3974, 'grad_norm': 2.5723137855529785, 'learning_rate': 1.2434167053813399e-05}[Rank 2] Trainer log: {'loss': 0.3974, 'grad_norm': 2.5723137855529785, 'learning_rate': 1.2434167053813399e-05} - - -[Rank 0] Trainer log: {'loss': 0.3974, 'grad_norm': 2.5723137855529785, 'learning_rate': 1.2434167053813399e-05} -{'loss': 0.3974, 'grad_norm': 2.5723137855529785, 'learning_rate': 1.2434167053813399e-05, 'epoch': 0.45} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0634, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1893, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002450444502756, 'train/lm_loss': 4.319678118918091e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.33894962072372437, 'train/uncertainty_loss': 0.01893337219953537, 'train/video_loss': 0.3598725199699402, 'train/total_loss': 0.35991570353507996} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00037994561716914177, 'train/lm_loss': 3.0086160404607656e-05, 'train/info_loss': 2.3602882720297202e-05, 'train/ref_loss': 0.09996718913316727, 'train/uncertainty_loss': -7.099661743268371e-05, 'train/video_loss': 0.10295935720205307, 'train/total_loss': 0.10298944264650345} -[Rank 0] Trainer log: {'loss': 0.3129, 'grad_norm': 2.1703672409057617, 'learning_rate': 1.2423818984132516e-05}[Rank 3] Trainer log: {'loss': 0.3129, 'grad_norm': 2.1703672409057617, 'learning_rate': 1.2423818984132516e-05} -[Rank 2] Trainer log: {'loss': 0.3129, 'grad_norm': 2.1703672409057617, 'learning_rate': 1.2423818984132516e-05} - -[Rank 1] Trainer log: {'loss': 0.3129, 'grad_norm': 2.1703672409057617, 'learning_rate': 1.2423818984132516e-05} -{'loss': 0.3129, 'grad_norm': 2.1703672409057617, 'learning_rate': 1.2423818984132516e-05, 'epoch': 0.45} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24948632717132568, 'train/info_loss': 0.21842114627361298, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001290246844291687, 'train/video_loss': 0.21829211711883545, 'train/total_loss': 0.46777844429016113} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0484, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(0.2157, device='cuda:3', grad_fn=) {'train/tv_loss': 0.00019249030156061054, 'train/lm_loss': 7.372927502728999e-05, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.20051893591880798, 'train/uncertainty_loss': -7.250637281686068e-05, 'train/video_loss': 0.20202043652534485, 'train/total_loss': 0.202094167470932} -tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2886, 'grad_norm': 4.46931266784668, 'learning_rate': 1.2413468156229182e-05} -[Rank 0] Trainer log: {'loss': 0.2886, 'grad_norm': 4.46931266784668, 'learning_rate': 1.2413468156229182e-05}[Rank 3] Trainer log: {'loss': 0.2886, 'grad_norm': 4.46931266784668, 'learning_rate': 1.2413468156229182e-05} -[Rank 2] Trainer log: {'loss': 0.2886, 'grad_norm': 4.46931266784668, 'learning_rate': 1.2413468156229182e-05} - -{'loss': 0.2886, 'grad_norm': 4.46931266784668, 'learning_rate': 1.2413468156229182e-05, 'epoch': 0.45} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.296465539932251, 'train/info_loss': 0.23881782591342926, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011904834536835551, 'train/video_loss': 0.2386987805366516, 'train/total_loss': 0.5351643562316895} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2992027521133423, 'train/info_loss': 0.18718549609184265, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012487102067098023, 'train/video_loss': 0.1870606243610382, 'train/total_loss': 0.4862633943557739} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1702, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4374, 'grad_norm': 5.3546142578125, 'learning_rate': 1.240311458188229e-05}[Rank 1] Trainer log: {'loss': 0.4374, 'grad_norm': 5.3546142578125, 'learning_rate': 1.240311458188229e-05}[Rank 3] Trainer log: {'loss': 0.4374, 'grad_norm': 5.3546142578125, 'learning_rate': 1.240311458188229e-05} - -[Rank 0] Trainer log: {'loss': 0.4374, 'grad_norm': 5.3546142578125, 'learning_rate': 1.240311458188229e-05} - -{'loss': 0.4374, 'grad_norm': 5.3546142578125, 'learning_rate': 1.240311458188229e-05, 'epoch': 0.45} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4538, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028182792011648415, 'train/lm_loss': 4.424559301696718e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.5086236596107483, 'train/uncertainty_loss': 0.04538052380084992, 'train/video_loss': 0.5562888979911804, 'train/total_loss': 0.5563331246376038} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21270198822021485, 'train/info_loss': 0.1724759340286255, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012519295560196043, 'train/video_loss': 0.17235073447227478, 'train/total_loss': 0.38505274057388306} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1461, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2936, 'grad_norm': 7.436012268066406, 'learning_rate': 1.2392758272873842e-05}[Rank 2] Trainer log: {'loss': 0.2936, 'grad_norm': 7.436012268066406, 'learning_rate': 1.2392758272873842e-05}[Rank 1] Trainer log: {'loss': 0.2936, 'grad_norm': 7.436012268066406, 'learning_rate': 1.2392758272873842e-05} - - -[Rank 0] Trainer log: {'loss': 0.2936, 'grad_norm': 7.436012268066406, 'learning_rate': 1.2392758272873842e-05} -{'loss': 0.2936, 'grad_norm': 7.436012268066406, 'learning_rate': 1.2392758272873842e-05, 'epoch': 0.45} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.5127, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0215, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001588208484463394, 'train/lm_loss': 5.7545967865735297e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.23315861821174622, 'train/uncertainty_loss': 0.0021475134417414665, 'train/video_loss': 0.23660697042942047, 'train/total_loss': 0.2366645187139511} -tensor(-0.0018, device='cuda:1', grad_fn=) tensor(-0.0018, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35532231330871583, 'train/info_loss': 0.10586348176002502, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015203538350760937, 'train/video_loss': 0.1057114452123642, 'train/total_loss': 0.46103376150131226} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(0.3175, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4991, 'grad_norm': 10.561589241027832, 'learning_rate': 1.2382399240988967e-05} -[Rank 2] Trainer log: {'loss': 0.4991, 'grad_norm': 10.561589241027832, 'learning_rate': 1.2382399240988967e-05} -[Rank 3] Trainer log: {'loss': 0.4991, 'grad_norm': 10.561589241027832, 'learning_rate': 1.2382399240988967e-05} -[Rank 0] Trainer log: {'loss': 0.4991, 'grad_norm': 10.561589241027832, 'learning_rate': 1.2382399240988967e-05} -{'loss': 0.4991, 'grad_norm': 10.561589241027832, 'learning_rate': 1.2382399240988967e-05, 'epoch': 0.45} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12551769018173217, 'train/info_loss': 0.09661006927490234, 'train/ref_loss': None, 'train/uncertainty_loss': -9.437418775632979e-05, 'train/video_loss': 0.09651569277048111, 'train/total_loss': 0.22203338146209717} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002834278624504805, 'train/lm_loss': 2.038374950643629e-05, 'train/info_loss': 2.0801588107133284e-05, 'train/ref_loss': 0.16279150545597076, 'train/uncertainty_loss': -7.328003412112594e-05, 'train/video_loss': 0.16500644385814667, 'train/total_loss': 0.16502682864665985} -[Rank 3] Trainer log: {'loss': 0.3624, 'grad_norm': 2.1851298809051514, 'learning_rate': 1.2372037498015882e-05}[Rank 1] Trainer log: {'loss': 0.3624, 'grad_norm': 2.1851298809051514, 'learning_rate': 1.2372037498015882e-05} - -[Rank 2] Trainer log: {'loss': 0.3624, 'grad_norm': 2.1851298809051514, 'learning_rate': 1.2372037498015882e-05} -[Rank 0] Trainer log: {'loss': 0.3624, 'grad_norm': 2.1851298809051514, 'learning_rate': 1.2372037498015882e-05} -{'loss': 0.3624, 'grad_norm': 2.1851298809051514, 'learning_rate': 1.2372037498015882e-05, 'epoch': 0.45} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28398590087890624, 'train/info_loss': 0.25571495294570923, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013898557517677546, 'train/video_loss': 0.255575954914093, 'train/total_loss': 0.5395618677139282} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3208, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018981551984325053, 'train/lm_loss': 8.390573784708977e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.42095857858657837, 'train/uncertainty_loss': 0.03208267390727997, 'train/video_loss': 0.4545949399471283, 'train/total_loss': 0.45467883348464966} -[Rank 1] Trainer log: {'loss': 0.443, 'grad_norm': 2.772779941558838, 'learning_rate': 1.2361673055745898e-05}[Rank 3] Trainer log: {'loss': 0.443, 'grad_norm': 2.772779941558838, 'learning_rate': 1.2361673055745898e-05}[Rank 2] Trainer log: {'loss': 0.443, 'grad_norm': 2.772779941558838, 'learning_rate': 1.2361673055745898e-05} - - -[Rank 0] Trainer log: {'loss': 0.443, 'grad_norm': 2.772779941558838, 'learning_rate': 1.2361673055745898e-05} -{'loss': 0.443, 'grad_norm': 2.772779941558838, 'learning_rate': 1.2361673055745898e-05, 'epoch': 0.45} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2673844814300537, 'train/info_loss': 0.10458933562040329, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001155378413386643, 'train/video_loss': 0.10447379946708679, 'train/total_loss': 0.37185829877853394} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24262278079986574, 'train/info_loss': 0.2430172711610794, 'train/ref_loss': None, 'train/uncertainty_loss': -9.419168345630169e-05, 'train/video_loss': 0.2429230809211731, 'train/total_loss': 0.4855458736419678} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3576, 'grad_norm': 3.6400623321533203, 'learning_rate': 1.2351305925973387e-05} -[Rank 0] Trainer log: {'loss': 0.3576, 'grad_norm': 3.6400623321533203, 'learning_rate': 1.2351305925973387e-05}[Rank 2] Trainer log: {'loss': 0.3576, 'grad_norm': 3.6400623321533203, 'learning_rate': 1.2351305925973387e-05}[Rank 1] Trainer log: {'loss': 0.3576, 'grad_norm': 3.6400623321533203, 'learning_rate': 1.2351305925973387e-05} - - -{'loss': 0.3576, 'grad_norm': 3.6400623321533203, 'learning_rate': 1.2351305925973387e-05, 'epoch': 0.45} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0543, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1622, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2346, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033909427002072335, 'train/lm_loss': 3.411478828638792e-05, 'train/info_loss': 2.515252708690241e-05, 'train/ref_loss': 0.364515483379364, 'train/uncertainty_loss': 0.02346245348453522, 'train/video_loss': 0.3907158374786377, 'train/total_loss': 0.3907499611377716} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23255393505096436, 'train/info_loss': 0.19503499567508698, 'train/ref_loss': None, 'train/uncertainty_loss': -9.37703880481422e-05, 'train/video_loss': 0.1949412226676941, 'train/total_loss': 0.42749518156051636} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3115, 'grad_norm': 8.173691749572754, 'learning_rate': 1.2340936120495793e-05}[Rank 3] Trainer log: {'loss': 0.3115, 'grad_norm': 8.173691749572754, 'learning_rate': 1.2340936120495793e-05}[Rank 1] Trainer log: {'loss': 0.3115, 'grad_norm': 8.173691749572754, 'learning_rate': 1.2340936120495793e-05} - - -[Rank 0] Trainer log: {'loss': 0.3115, 'grad_norm': 8.173691749572754, 'learning_rate': 1.2340936120495793e-05} -{'loss': 0.3115, 'grad_norm': 8.173691749572754, 'learning_rate': 1.2340936120495793e-05, 'epoch': 0.45} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021853328216820956, 'train/lm_loss': 3.3852574415504935e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.11059975624084473, 'train/uncertainty_loss': -6.855300744064152e-05, 'train/video_loss': 0.11230521649122238, 'train/total_loss': 0.1123390719294548} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09411268830299378, 'train/info_loss': 0.20410458743572235, 'train/ref_loss': None, 'train/uncertainty_loss': -9.491422097198666e-05, 'train/video_loss': 0.20400966703891754, 'train/total_loss': 0.2981223464012146} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3372, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3243, 'grad_norm': 7.4432501792907715, 'learning_rate': 1.233056365111359e-05} -[Rank 3] Trainer log: {'loss': 0.3243, 'grad_norm': 7.4432501792907715, 'learning_rate': 1.233056365111359e-05} -[Rank 0] Trainer log: {'loss': 0.3243, 'grad_norm': 7.4432501792907715, 'learning_rate': 1.233056365111359e-05}[Rank 2] Trainer log: {'loss': 0.3243, 'grad_norm': 7.4432501792907715, 'learning_rate': 1.233056365111359e-05} - -{'loss': 0.3243, 'grad_norm': 7.4432501792907715, 'learning_rate': 1.233056365111359e-05, 'epoch': 0.45} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07659597992897034, 'train/info_loss': 0.14120633900165558, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011197221465408802, 'train/video_loss': 0.141094371676445, 'train/total_loss': 0.2176903486251831} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1965, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37675483226776124, 'train/info_loss': 0.10467775911092758, 'train/ref_loss': None, 'train/uncertainty_loss': -8.374348399229348e-05, 'train/video_loss': 0.10459401458501816, 'train/total_loss': 0.4813488721847534} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.4778, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3936, 'grad_norm': 8.61507511138916, 'learning_rate': 1.2320188529630293e-05}[Rank 2] Trainer log: {'loss': 0.3936, 'grad_norm': 8.61507511138916, 'learning_rate': 1.2320188529630293e-05}[Rank 3] Trainer log: {'loss': 0.3936, 'grad_norm': 8.61507511138916, 'learning_rate': 1.2320188529630293e-05} - - -[Rank 0] Trainer log: {'loss': 0.3936, 'grad_norm': 8.61507511138916, 'learning_rate': 1.2320188529630293e-05} -{'loss': 0.3936, 'grad_norm': 8.61507511138916, 'learning_rate': 1.2320188529630293e-05, 'epoch': 0.45} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2560771703720093, 'train/info_loss': 0.16419243812561035, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011767866089940071, 'train/video_loss': 0.16407476365566254, 'train/total_loss': 0.420151948928833} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3416083097457886, 'train/info_loss': 0.19544251263141632, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013001784682273865, 'train/video_loss': 0.1953125, 'train/total_loss': 0.5369207859039307} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3861, 'grad_norm': 3.2564568519592285, 'learning_rate': 1.2309810767852435e-05} -[Rank 2] Trainer log: {'loss': 0.3861, 'grad_norm': 3.2564568519592285, 'learning_rate': 1.2309810767852435e-05}[Rank 0] Trainer log: {'loss': 0.3861, 'grad_norm': 3.2564568519592285, 'learning_rate': 1.2309810767852435e-05} - -[Rank 3] Trainer log: {'loss': 0.3861, 'grad_norm': 3.2564568519592285, 'learning_rate': 1.2309810767852435e-05} -{'loss': 0.3861, 'grad_norm': 3.2564568519592285, 'learning_rate': 1.2309810767852435e-05, 'epoch': 0.45} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024803250562399626, 'train/lm_loss': 5.008546868339181e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.16793479025363922, 'train/uncertainty_loss': -6.978113669902087e-05, 'train/video_loss': 0.16987840831279755, 'train/total_loss': 0.16992849111557007} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2597484827041626, 'train/info_loss': 0.24270959198474884, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010981826344504953, 'train/video_loss': 0.24259977042675018, 'train/total_loss': 0.5023482441902161} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1457, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3516, 'grad_norm': 3.3338637351989746, 'learning_rate': 1.2299430377589548e-05} -[Rank 3] Trainer log: {'loss': 0.3516, 'grad_norm': 3.3338637351989746, 'learning_rate': 1.2299430377589548e-05} -[Rank 2] Trainer log: {'loss': 0.3516, 'grad_norm': 3.3338637351989746, 'learning_rate': 1.2299430377589548e-05} -[Rank 0] Trainer log: {'loss': 0.3516, 'grad_norm': 3.3338637351989746, 'learning_rate': 1.2299430377589548e-05} -{'loss': 0.3516, 'grad_norm': 3.3338637351989746, 'learning_rate': 1.2299430377589548e-05, 'epoch': 0.45} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2856121063232422, 'train/info_loss': 0.2614346146583557, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014720143517479303, 'train/video_loss': 0.2612874209880829, 'train/total_loss': 0.5468995571136475} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2542, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0086, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24377028942108156, 'train/info_loss': 0.08791998773813248, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011092271888628602, 'train/video_loss': 0.08780906349420547, 'train/total_loss': 0.3315793573856354} -[Rank 3] Trainer log: {'loss': 0.4435, 'grad_norm': 5.717404365539551, 'learning_rate': 1.228904737065416e-05}[Rank 0] Trainer log: {'loss': 0.4435, 'grad_norm': 5.717404365539551, 'learning_rate': 1.228904737065416e-05}[Rank 2] Trainer log: {'loss': 0.4435, 'grad_norm': 5.717404365539551, 'learning_rate': 1.228904737065416e-05} - -[Rank 1] Trainer log: {'loss': 0.4435, 'grad_norm': 5.717404365539551, 'learning_rate': 1.228904737065416e-05} - -{'loss': 0.4435, 'grad_norm': 5.717404365539551, 'learning_rate': 1.228904737065416e-05, 'epoch': 0.46} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2963, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002872005570679903, 'train/lm_loss': 5.690241814590991e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.41435137391090393, 'train/uncertainty_loss': 0.029634428024291993, 'train/video_loss': 0.4463135004043579, 'train/total_loss': 0.446370393037796} -tensor(0.0756, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22236242294311526, 'train/info_loss': 0.29338204860687256, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012817238457500936, 'train/video_loss': 0.2932538688182831, 'train/total_loss': 0.5156162977218628} -tensor(0.1293, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4046, 'grad_norm': 3.3068130016326904, 'learning_rate': 1.2278661758861774e-05} -[Rank 3] Trainer log: {'loss': 0.4046, 'grad_norm': 3.3068130016326904, 'learning_rate': 1.2278661758861774e-05}[Rank 0] Trainer log: {'loss': 0.4046, 'grad_norm': 3.3068130016326904, 'learning_rate': 1.2278661758861774e-05} -[Rank 2] Trainer log: {'loss': 0.4046, 'grad_norm': 3.3068130016326904, 'learning_rate': 1.2278661758861774e-05} - -{'loss': 0.4046, 'grad_norm': 3.3068130016326904, 'learning_rate': 1.2278661758861774e-05, 'epoch': 0.46} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2118786096572876, 'train/info_loss': 0.1694498509168625, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001256887335330248, 'train/video_loss': 0.16932415962219238, 'train/total_loss': 0.381202757358551} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0644, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12172538042068481, 'train/info_loss': 0.394825279712677, 'train/ref_loss': None, 'train/uncertainty_loss': -9.341825498268009e-05, 'train/video_loss': 0.3947318494319916, 'train/total_loss': 0.516457200050354} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2477, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4649, 'grad_norm': 5.811886310577393, 'learning_rate': 1.2268273554030858e-05} -[Rank 0] Trainer log: {'loss': 0.4649, 'grad_norm': 5.811886310577393, 'learning_rate': 1.2268273554030858e-05}[Rank 2] Trainer log: {'loss': 0.4649, 'grad_norm': 5.811886310577393, 'learning_rate': 1.2268273554030858e-05}[Rank 3] Trainer log: {'loss': 0.4649, 'grad_norm': 5.811886310577393, 'learning_rate': 1.2268273554030858e-05} - - -{'loss': 0.4649, 'grad_norm': 5.811886310577393, 'learning_rate': 1.2268273554030858e-05, 'epoch': 0.46} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020567423198372128, 'train/lm_loss': 3.018151328433305e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.11993731558322906, 'train/uncertainty_loss': -7.071913569234312e-05, 'train/video_loss': 0.12153469771146774, 'train/total_loss': 0.12156488001346588} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024024830199778082, 'train/lm_loss': 3.857240662910044e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.16065099835395813, 'train/uncertainty_loss': -7.064326200634241e-05, 'train/video_loss': 0.16252927482128143, 'train/total_loss': 0.1625678539276123} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3075, 'grad_norm': 6.294760704040527, 'learning_rate': 1.2257882767982834e-05}[Rank 3] Trainer log: {'loss': 0.3075, 'grad_norm': 6.294760704040527, 'learning_rate': 1.2257882767982834e-05}[Rank 1] Trainer log: {'loss': 0.3075, 'grad_norm': 6.294760704040527, 'learning_rate': 1.2257882767982834e-05} - - -[Rank 2] Trainer log: {'loss': 0.3075, 'grad_norm': 6.294760704040527, 'learning_rate': 1.2257882767982834e-05} -{'loss': 0.3075, 'grad_norm': 6.294760704040527, 'learning_rate': 1.2257882767982834e-05, 'epoch': 0.46} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1647, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0714, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002696096198633313, 'train/lm_loss': 3.032454405911267e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.25798094272613525, 'train/uncertainty_loss': 0.007140875607728959, 'train/video_loss': 0.26730141043663025, 'train/total_loss': 0.2673317492008209} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2063, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(0.1273, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00040160580538213254, 'train/lm_loss': 3.392408543732017e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.2954278588294983, 'train/uncertainty_loss': 0.012726439535617829, 'train/video_loss': 0.3113936483860016, 'train/total_loss': 0.3114275634288788} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3652, 'grad_norm': 3.927255868911743, 'learning_rate': 1.2247489412542055e-05}[Rank 1] Trainer log: {'loss': 0.3652, 'grad_norm': 3.927255868911743, 'learning_rate': 1.2247489412542055e-05} -[Rank 2] Trainer log: {'loss': 0.3652, 'grad_norm': 3.927255868911743, 'learning_rate': 1.2247489412542055e-05} - -[Rank 3] Trainer log: {'loss': 0.3652, 'grad_norm': 3.927255868911743, 'learning_rate': 1.2247489412542055e-05} -{'loss': 0.3652, 'grad_norm': 3.927255868911743, 'learning_rate': 1.2247489412542055e-05, 'epoch': 0.46} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(0.2368, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003477975027635694, 'train/lm_loss': 4.362584149930626e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.3726978302001953, 'train/uncertainty_loss': 0.023676423728466036, 'train/video_loss': 0.39918673038482666, 'train/total_loss': 0.39923036098480225} -tensor(0.0274, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1201, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0035, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002711305860430002, 'train/lm_loss': 3.370954655110836e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.21285444498062134, 'train/uncertainty_loss': 0.000352410925552249, 'train/video_loss': 0.2154020071029663, 'train/total_loss': 0.2154357135295868} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3972, 'grad_norm': 2.5728228092193604, 'learning_rate': 1.2237093499535801e-05}[Rank 3] Trainer log: {'loss': 0.3972, 'grad_norm': 2.5728228092193604, 'learning_rate': 1.2237093499535801e-05} -[Rank 1] Trainer log: {'loss': 0.3972, 'grad_norm': 2.5728228092193604, 'learning_rate': 1.2237093499535801e-05} - -[Rank 2] Trainer log: {'loss': 0.3972, 'grad_norm': 2.5728228092193604, 'learning_rate': 1.2237093499535801e-05} -{'loss': 0.3972, 'grad_norm': 2.5728228092193604, 'learning_rate': 1.2237093499535801e-05, 'epoch': 0.46} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28081784248352054, 'train/info_loss': 0.21471856534481049, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013465032679960132, 'train/video_loss': 0.21458391845226288, 'train/total_loss': 0.4954017400741577} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3222, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30147428512573243, 'train/info_loss': 0.1728658229112625, 'train/ref_loss': None, 'train/uncertainty_loss': -9.694981272332371e-05, 'train/video_loss': 0.17276887595653534, 'train/total_loss': 0.4742431640625} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2637, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4905, 'grad_norm': 3.052776336669922, 'learning_rate': 1.2226695040794264e-05} -[Rank 0] Trainer log: {'loss': 0.4905, 'grad_norm': 3.052776336669922, 'learning_rate': 1.2226695040794264e-05}[Rank 2] Trainer log: {'loss': 0.4905, 'grad_norm': 3.052776336669922, 'learning_rate': 1.2226695040794264e-05} - -[Rank 3] Trainer log: {'loss': 0.4905, 'grad_norm': 3.052776336669922, 'learning_rate': 1.2226695040794264e-05} -{'loss': 0.4905, 'grad_norm': 3.052776336669922, 'learning_rate': 1.2226695040794264e-05, 'epoch': 0.46} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11288316249847413, 'train/info_loss': 0.1512836366891861, 'train/ref_loss': None, 'train/uncertainty_loss': -9.774109348654748e-05, 'train/video_loss': 0.15118589997291565, 'train/total_loss': 0.2640690803527832} -tensor(0.1440, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023094199132174255, 'train/lm_loss': 2.989545464515686e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.19911713898181915, 'train/uncertainty_loss': -6.95238122716546e-05, 'train/video_loss': 0.20091892778873444, 'train/total_loss': 0.2009488195180893} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3719, 'grad_norm': 9.304122924804688, 'learning_rate': 1.2216294048150532e-05} -[Rank 0] Trainer log: {'loss': 0.3719, 'grad_norm': 9.304122924804688, 'learning_rate': 1.2216294048150532e-05}[Rank 1] Trainer log: {'loss': 0.3719, 'grad_norm': 9.304122924804688, 'learning_rate': 1.2216294048150532e-05} -[Rank 3] Trainer log: {'loss': 0.3719, 'grad_norm': 9.304122924804688, 'learning_rate': 1.2216294048150532e-05} - -{'loss': 0.3719, 'grad_norm': 9.304122924804688, 'learning_rate': 1.2216294048150532e-05, 'epoch': 0.46} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26950299739837646, 'train/info_loss': 0.20222017168998718, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013659076066687703, 'train/video_loss': 0.20208358764648438, 'train/total_loss': 0.47158658504486084} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2915, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(1.0296, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.5939, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1015, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002854016376659274, 'train/lm_loss': 2.9943132540211084e-05, 'train/info_loss': 2.288765972480178e-05, 'train/ref_loss': 0.2780076265335083, 'train/uncertainty_loss': 0.010151395201683046, 'train/video_loss': 0.2904651165008545, 'train/total_loss': 0.2904950678348541} -[Rank 1] Trainer log: {'loss': 0.4912, 'grad_norm': 22.12078285217285, 'learning_rate': 1.2205890533440576e-05}[Rank 3] Trainer log: {'loss': 0.4912, 'grad_norm': 22.12078285217285, 'learning_rate': 1.2205890533440576e-05} - -[Rank 0] Trainer log: {'loss': 0.4912, 'grad_norm': 22.12078285217285, 'learning_rate': 1.2205890533440576e-05}[Rank 2] Trainer log: {'loss': 0.4912, 'grad_norm': 22.12078285217285, 'learning_rate': 1.2205890533440576e-05} - -{'loss': 0.4912, 'grad_norm': 22.12078285217285, 'learning_rate': 1.2205890533440576e-05, 'epoch': 0.46} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001103383768349886, 'train/lm_loss': 2.6295840507373216e-05, 'train/info_loss': 2.181482341256924e-05, 'train/ref_loss': 0.20506176352500916, 'train/uncertainty_loss': -7.183199631981552e-05, 'train/video_loss': 0.21383881568908691, 'train/total_loss': 0.21386511623859406} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0212, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031070285476744177, 'train/lm_loss': 3.8238684646785265e-05, 'train/info_loss': 2.396049239905551e-05, 'train/ref_loss': 0.1866799294948578, 'train/uncertainty_loss': 0.0021192327141761782, 'train/video_loss': 0.19130873680114746, 'train/total_loss': 0.19134697318077087} -[Rank 2] Trainer log: {'loss': 0.3472, 'grad_norm': 7.5659098625183105, 'learning_rate': 1.2195484508503235e-05}[Rank 1] Trainer log: {'loss': 0.3472, 'grad_norm': 7.5659098625183105, 'learning_rate': 1.2195484508503235e-05}[Rank 3] Trainer log: {'loss': 0.3472, 'grad_norm': 7.5659098625183105, 'learning_rate': 1.2195484508503235e-05} - - -[Rank 0] Trainer log: {'loss': 0.3472, 'grad_norm': 7.5659098625183105, 'learning_rate': 1.2195484508503235e-05} -{'loss': 0.3472, 'grad_norm': 7.5659098625183105, 'learning_rate': 1.2195484508503235e-05, 'epoch': 0.46} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3532389879226685, 'train/info_loss': 0.20469427108764648, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011103157885372639, 'train/video_loss': 0.20458324253559113, 'train/total_loss': 0.5578222274780273} -tensor(0.1813, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18859562873840333, 'train/info_loss': 0.34698721766471863, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012940322048962116, 'train/video_loss': 0.34685781598091125, 'train/total_loss': 0.5354534387588501} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0473, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.459, 'grad_norm': 6.791378974914551, 'learning_rate': 1.218507598518021e-05} -[Rank 0] Trainer log: {'loss': 0.459, 'grad_norm': 6.791378974914551, 'learning_rate': 1.218507598518021e-05}[Rank 3] Trainer log: {'loss': 0.459, 'grad_norm': 6.791378974914551, 'learning_rate': 1.218507598518021e-05} - -[Rank 2] Trainer log: {'loss': 0.459, 'grad_norm': 6.791378974914551, 'learning_rate': 1.218507598518021e-05} -{'loss': 0.459, 'grad_norm': 6.791378974914551, 'learning_rate': 1.218507598518021e-05, 'epoch': 0.46} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0394, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023717032745480538, 'train/lm_loss': 4.446012317202986e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.24814534187316895, 'train/uncertainty_loss': 0.0039384752511978155, 'train/video_loss': 0.25400853157043457, 'train/total_loss': 0.254052996635437} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031396220438182357, 'train/lm_loss': 2.9800101765431466e-05, 'train/info_loss': 2.396049239905551e-05, 'train/ref_loss': 0.14615750312805176, 'train/uncertainty_loss': -7.024139049462975e-05, 'train/video_loss': 0.14862291514873505, 'train/total_loss': 0.14865271747112274} -[Rank 2] Trainer log: {'loss': 0.3531, 'grad_norm': 1.814233660697937, 'learning_rate': 1.217466497531604e-05} -[Rank 1] Trainer log: {'loss': 0.3531, 'grad_norm': 1.814233660697937, 'learning_rate': 1.217466497531604e-05}[Rank 0] Trainer log: {'loss': 0.3531, 'grad_norm': 1.814233660697937, 'learning_rate': 1.217466497531604e-05} -[Rank 3] Trainer log: {'loss': 0.3531, 'grad_norm': 1.814233660697937, 'learning_rate': 1.217466497531604e-05} - -{'loss': 0.3531, 'grad_norm': 1.814233660697937, 'learning_rate': 1.217466497531604e-05, 'epoch': 0.46} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0699, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000329440669156611, 'train/lm_loss': 2.3006070114206524e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.26085323095321655, 'train/uncertainty_loss': 0.0069914057850837714, 'train/video_loss': 0.27050116658210754, 'train/total_loss': 0.27052417397499084} -tensor(0.0639, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028689345344901085, 'train/lm_loss': 3.850089560728521e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.19060997664928436, 'train/uncertainty_loss': -6.875995895825327e-05, 'train/video_loss': 0.19286486506462097, 'train/total_loss': 0.19290336966514587} -tensor(0.2670, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0214, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.376, 'grad_norm': 17.644432067871094, 'learning_rate': 1.2164251490758095e-05}[Rank 3] Trainer log: {'loss': 0.376, 'grad_norm': 17.644432067871094, 'learning_rate': 1.2164251490758095e-05} - -[Rank 1] Trainer log: {'loss': 0.376, 'grad_norm': 17.644432067871094, 'learning_rate': 1.2164251490758095e-05} -[Rank 0] Trainer log: {'loss': 0.376, 'grad_norm': 17.644432067871094, 'learning_rate': 1.2164251490758095e-05} -{'loss': 0.376, 'grad_norm': 17.644432067871094, 'learning_rate': 1.2164251490758095e-05, 'epoch': 0.46} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18135756254196167, 'train/info_loss': 0.20690558850765228, 'train/ref_loss': None, 'train/uncertainty_loss': -8.974148076958955e-05, 'train/video_loss': 0.20681585371494293, 'train/total_loss': 0.3881734013557434} -tensor(0.0162, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0615, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38494420051574707, 'train/info_loss': 0.14521001279354095, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014750065747648478, 'train/video_loss': 0.14506250619888306, 'train/total_loss': 0.5300067067146301} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2987, 'grad_norm': 5.972226619720459, 'learning_rate': 1.2153835543356556e-05}[Rank 1] Trainer log: {'loss': 0.2987, 'grad_norm': 5.972226619720459, 'learning_rate': 1.2153835543356556e-05} -[Rank 3] Trainer log: {'loss': 0.2987, 'grad_norm': 5.972226619720459, 'learning_rate': 1.2153835543356556e-05} - -[Rank 0] Trainer log: {'loss': 0.2987, 'grad_norm': 5.972226619720459, 'learning_rate': 1.2153835543356556e-05} -{'loss': 0.2987, 'grad_norm': 5.972226619720459, 'learning_rate': 1.2153835543356556e-05, 'epoch': 0.46} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3046, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020111023914068938, 'train/lm_loss': 2.999080752488226e-05, 'train/info_loss': 2.396049239905551e-05, 'train/ref_loss': 0.41804832220077515, 'train/uncertainty_loss': 0.03046020567417145, 'train/video_loss': 0.45014137029647827, 'train/total_loss': 0.4501713514328003} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30690462589263917, 'train/info_loss': 0.17335285246372223, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011745361844077707, 'train/video_loss': 0.17323540151119232, 'train/total_loss': 0.4801400303840637} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5618, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.408, 'grad_norm': 5.098149299621582, 'learning_rate': 1.2143417144964425e-05}[Rank 3] Trainer log: {'loss': 0.408, 'grad_norm': 5.098149299621582, 'learning_rate': 1.2143417144964425e-05} -[Rank 2] Trainer log: {'loss': 0.408, 'grad_norm': 5.098149299621582, 'learning_rate': 1.2143417144964425e-05} - -[Rank 0] Trainer log: {'loss': 0.408, 'grad_norm': 5.098149299621582, 'learning_rate': 1.2143417144964425e-05} -{'loss': 0.408, 'grad_norm': 5.098149299621582, 'learning_rate': 1.2143417144964425e-05, 'epoch': 0.46} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4179533481597901, 'train/info_loss': 0.2431000918149948, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010394980199635029, 'train/video_loss': 0.24299614131450653, 'train/total_loss': 0.6609494686126709} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16808848381042482, 'train/info_loss': 0.11865807324647903, 'train/ref_loss': None, 'train/uncertainty_loss': -9.235942270606756e-05, 'train/video_loss': 0.11856571584939957, 'train/total_loss': 0.28665420413017273} -[Rank 3] Trainer log: {'loss': 0.3516, 'grad_norm': 4.3635125160217285, 'learning_rate': 1.213299630743747e-05}[Rank 2] Trainer log: {'loss': 0.3516, 'grad_norm': 4.3635125160217285, 'learning_rate': 1.213299630743747e-05}[Rank 0] Trainer log: {'loss': 0.3516, 'grad_norm': 4.3635125160217285, 'learning_rate': 1.213299630743747e-05} - -[Rank 1] Trainer log: {'loss': 0.3516, 'grad_norm': 4.3635125160217285, 'learning_rate': 1.213299630743747e-05} - -{'loss': 0.3516, 'grad_norm': 4.3635125160217285, 'learning_rate': 1.213299630743747e-05, 'epoch': 0.46} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024799022357910875, 'train/lm_loss': 2.989545464515686e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.08048927783966064, 'train/uncertainty_loss': -6.888431962579489e-05, 'train/video_loss': 0.08242923021316528, 'train/total_loss': 0.08245912194252014} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.004461218044161797, 'train/info_loss': 0.2092473804950714, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010699948761612178, 'train/video_loss': 0.2091403752565384, 'train/total_loss': 0.21360158920288086} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0673, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2591, 'grad_norm': 2.810879707336426, 'learning_rate': 1.2122573042634249e-05}[Rank 3] Trainer log: {'loss': 0.2591, 'grad_norm': 2.810879707336426, 'learning_rate': 1.2122573042634249e-05}[Rank 1] Trainer log: {'loss': 0.2591, 'grad_norm': 2.810879707336426, 'learning_rate': 1.2122573042634249e-05} - - -[Rank 0] Trainer log: {'loss': 0.2591, 'grad_norm': 2.810879707336426, 'learning_rate': 1.2122573042634249e-05} -{'loss': 0.2591, 'grad_norm': 2.810879707336426, 'learning_rate': 1.2122573042634249e-05, 'epoch': 0.46} -tensor(-0.0019, device='cuda:0', grad_fn=) tensor(-0.0019, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.46780824661254883, 'train/info_loss': 0.18090534210205078, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001853309338912368, 'train/video_loss': 0.1807200163602829, 'train/total_loss': 0.6485282778739929} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33719949722290044, 'train/info_loss': 0.13143105804920197, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013430326944217086, 'train/video_loss': 0.13129675388336182, 'train/total_loss': 0.46849626302719116} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.481, 'grad_norm': 2.2527976036071777, 'learning_rate': 1.2112147362416076e-05} -[Rank 3] Trainer log: {'loss': 0.481, 'grad_norm': 2.2527976036071777, 'learning_rate': 1.2112147362416076e-05} -[Rank 0] Trainer log: {'loss': 0.481, 'grad_norm': 2.2527976036071777, 'learning_rate': 1.2112147362416076e-05} -[Rank 2] Trainer log: {'loss': 0.481, 'grad_norm': 2.2527976036071777, 'learning_rate': 1.2112147362416076e-05} -{'loss': 0.481, 'grad_norm': 2.2527976036071777, 'learning_rate': 1.2112147362416076e-05, 'epoch': 0.46} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25636327266693115, 'train/info_loss': 0.2757655382156372, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001369433826766908, 'train/video_loss': 0.27562859654426575, 'train/total_loss': 0.5319918394088745} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0270, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21061325073242188, 'train/info_loss': 0.1638813316822052, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010072158183902503, 'train/video_loss': 0.16378061473369598, 'train/total_loss': 0.37439388036727905} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1400, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.365, 'grad_norm': 3.147430896759033, 'learning_rate': 1.2101719278647016e-05}[Rank 3] Trainer log: {'loss': 0.365, 'grad_norm': 3.147430896759033, 'learning_rate': 1.2101719278647016e-05} - -[Rank 0] Trainer log: {'loss': 0.365, 'grad_norm': 3.147430896759033, 'learning_rate': 1.2101719278647016e-05}[Rank 2] Trainer log: {'loss': 0.365, 'grad_norm': 3.147430896759033, 'learning_rate': 1.2101719278647016e-05} - -{'loss': 0.365, 'grad_norm': 3.147430896759033, 'learning_rate': 1.2101719278647016e-05, 'epoch': 0.46} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36061921119689944, 'train/info_loss': 0.1320500522851944, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001267287414520979, 'train/video_loss': 0.13192331790924072, 'train/total_loss': 0.4925425350666046} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0784, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018369253957644107, 'train/lm_loss': 3.8429381675086915e-05, 'train/info_loss': 2.7596188374445774e-05, 'train/ref_loss': 0.2712021470069885, 'train/uncertainty_loss': 0.007836093753576278, 'train/video_loss': 0.28053539991378784, 'train/total_loss': 0.2805738151073456} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3547, 'grad_norm': 3.5393178462982178, 'learning_rate': 1.209128880319387e-05}[Rank 3] Trainer log: {'loss': 0.3547, 'grad_norm': 3.5393178462982178, 'learning_rate': 1.209128880319387e-05}[Rank 0] Trainer log: {'loss': 0.3547, 'grad_norm': 3.5393178462982178, 'learning_rate': 1.209128880319387e-05} - - -{'loss': 0.3547, 'grad_norm': 3.5393178462982178, 'learning_rate': 1.209128880319387e-05, 'epoch': 0.46} -[Rank 1] Trainer log: {'loss': 0.3547, 'grad_norm': 3.5393178462982178, 'learning_rate': 1.209128880319387e-05} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18393999338150024, 'train/info_loss': 0.22577527165412903, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011848442954942584, 'train/video_loss': 0.22565679252147675, 'train/total_loss': 0.4095968008041382} -tensor(0.2477, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0618, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31764185428619385, 'train/info_loss': 0.2783219516277313, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011413168394938112, 'train/video_loss': 0.27820780873298645, 'train/total_loss': 0.5958496332168579} -tensor(0.0005, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3688, 'grad_norm': 8.457040786743164, 'learning_rate': 1.208085594792616e-05} -[Rank 1] Trainer log: {'loss': 0.3688, 'grad_norm': 8.457040786743164, 'learning_rate': 1.208085594792616e-05} -[Rank 0] Trainer log: {'loss': 0.3688, 'grad_norm': 8.457040786743164, 'learning_rate': 1.208085594792616e-05} -[Rank 3] Trainer log: {'loss': 0.3688, 'grad_norm': 8.457040786743164, 'learning_rate': 1.208085594792616e-05} -{'loss': 0.3688, 'grad_norm': 8.457040786743164, 'learning_rate': 1.208085594792616e-05, 'epoch': 0.46} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027023260481655597, 'train/lm_loss': 2.975242678076029e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.16997316479682922, 'train/uncertainty_loss': -6.928298971615731e-05, 'train/video_loss': 0.1720910668373108, 'train/total_loss': 0.1721208244562149} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28614346981048583, 'train/info_loss': 0.5183919072151184, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013546874979510903, 'train/video_loss': 0.518256425857544, 'train/total_loss': 0.8043999075889587} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5127, 'grad_norm': 6.061588287353516, 'learning_rate': 1.2070420724716113e-05} -[Rank 2] Trainer log: {'loss': 0.5127, 'grad_norm': 6.061588287353516, 'learning_rate': 1.2070420724716113e-05}[Rank 0] Trainer log: {'loss': 0.5127, 'grad_norm': 6.061588287353516, 'learning_rate': 1.2070420724716113e-05} - -[Rank 3] Trainer log: {'loss': 0.5127, 'grad_norm': 6.061588287353516, 'learning_rate': 1.2070420724716113e-05} -{'loss': 0.5127, 'grad_norm': 6.061588287353516, 'learning_rate': 1.2070420724716113e-05, 'epoch': 0.46} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.6225, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1357, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022277461830526592, 'train/lm_loss': 5.695009022019804e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.16193526983261108, 'train/uncertainty_loss': -7.257931865751743e-05, 'train/video_loss': 0.1636735498905182, 'train/total_loss': 0.16373050212860107} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2993167400360108, 'train/info_loss': 0.2788437604904175, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011388739803805948, 'train/video_loss': 0.2787298858165741, 'train/total_loss': 0.5780466198921204} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3918, 'grad_norm': 9.340973854064941, 'learning_rate': 1.2059983145438654e-05}[Rank 1] Trainer log: {'loss': 0.3918, 'grad_norm': 9.340973854064941, 'learning_rate': 1.2059983145438654e-05} -[Rank 3] Trainer log: {'loss': 0.3918, 'grad_norm': 9.340973854064941, 'learning_rate': 1.2059983145438654e-05} - -[Rank 0] Trainer log: {'loss': 0.3918, 'grad_norm': 9.340973854064941, 'learning_rate': 1.2059983145438654e-05} -{'loss': 0.3918, 'grad_norm': 9.340973854064941, 'learning_rate': 1.2059983145438654e-05, 'epoch': 0.46} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0993, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12254739999771119, 'train/info_loss': 0.16653083264827728, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010938070481643082, 'train/video_loss': 0.16642145812511444, 'train/total_loss': 0.28896886110305786} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1485, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016002123011276128, 'train/lm_loss': 6.386213935911655e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.3119538724422455, 'train/uncertainty_loss': 0.01484757959842682, 'train/video_loss': 0.328116774559021, 'train/total_loss': 0.3281806409358978} -[Rank 1] Trainer log: {'loss': 0.3753, 'grad_norm': 2.6542556285858154, 'learning_rate': 1.2049543221971392e-05}[Rank 3] Trainer log: {'loss': 0.3753, 'grad_norm': 2.6542556285858154, 'learning_rate': 1.2049543221971392e-05}[Rank 0] Trainer log: {'loss': 0.3753, 'grad_norm': 2.6542556285858154, 'learning_rate': 1.2049543221971392e-05} - -[Rank 2] Trainer log: {'loss': 0.3753, 'grad_norm': 2.6542556285858154, 'learning_rate': 1.2049543221971392e-05} - -{'loss': 0.3753, 'grad_norm': 2.6542556285858154, 'learning_rate': 1.2049543221971392e-05, 'epoch': 0.46} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3906, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002746510086581111, 'train/lm_loss': 6.450566579587758e-05, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.47199124097824097, 'train/uncertainty_loss': 0.039058837294578555, 'train/video_loss': 0.5132814049720764, 'train/total_loss': 0.5133458971977234} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.6332, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002094317926093936, 'train/lm_loss': 4.3935718713328244e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.1010478138923645, 'train/uncertainty_loss': -6.990624824538827e-05, 'train/video_loss': 0.10268322378396988, 'train/total_loss': 0.10272715985774994} -[Rank 1] Trainer log: {'loss': 0.3711, 'grad_norm': 7.748622417449951, 'learning_rate': 1.2039100966194594e-05}[Rank 3] Trainer log: {'loss': 0.3711, 'grad_norm': 7.748622417449951, 'learning_rate': 1.2039100966194594e-05} -[Rank 2] Trainer log: {'loss': 0.3711, 'grad_norm': 7.748622417449951, 'learning_rate': 1.2039100966194594e-05} - -[Rank 0] Trainer log: {'loss': 0.3711, 'grad_norm': 7.748622417449951, 'learning_rate': 1.2039100966194594e-05} -{'loss': 0.3711, 'grad_norm': 7.748622417449951, 'learning_rate': 1.2039100966194594e-05, 'epoch': 0.46} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12949122190475465, 'train/info_loss': 0.2280755490064621, 'train/ref_loss': None, 'train/uncertainty_loss': -9.73225513007492e-05, 'train/video_loss': 0.22797822952270508, 'train/total_loss': 0.35746943950653076} -tensor(0.0574, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2820, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0897, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0269, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002531849779188633, 'train/lm_loss': 3.0038485419936478e-05, 'train/info_loss': 2.6344558136770502e-05, 'train/ref_loss': 0.23387736082077026, 'train/uncertainty_loss': 0.0026890246197581293, 'train/video_loss': 0.23861820995807648, 'train/total_loss': 0.23864825069904327} -[Rank 1] Trainer log: {'loss': 0.3294, 'grad_norm': 5.007693767547607, 'learning_rate': 1.2028656389991193e-05}[Rank 3] Trainer log: {'loss': 0.3294, 'grad_norm': 5.007693767547607, 'learning_rate': 1.2028656389991193e-05} - -[Rank 0] Trainer log: {'loss': 0.3294, 'grad_norm': 5.007693767547607, 'learning_rate': 1.2028656389991193e-05} -[Rank 2] Trainer log: {'loss': 0.3294, 'grad_norm': 5.007693767547607, 'learning_rate': 1.2028656389991193e-05} -{'loss': 0.3294, 'grad_norm': 5.007693767547607, 'learning_rate': 1.2028656389991193e-05, 'epoch': 0.46} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25263655185699463, 'train/info_loss': 0.25546640157699585, 'train/ref_loss': None, 'train/uncertainty_loss': -8.964919834397733e-05, 'train/video_loss': 0.25537675619125366, 'train/total_loss': 0.5080133080482483} -tensor(0.2886, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08518304228782654, 'train/info_loss': 0.215057373046875, 'train/ref_loss': None, 'train/uncertainty_loss': -8.370631840080023e-05, 'train/video_loss': 0.21497367322444916, 'train/total_loss': 0.30015671253204346} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3782, 'grad_norm': 5.7404398918151855, 'learning_rate': 1.2018209505246755e-05}[Rank 0] Trainer log: {'loss': 0.3782, 'grad_norm': 5.7404398918151855, 'learning_rate': 1.2018209505246755e-05}[Rank 3] Trainer log: {'loss': 0.3782, 'grad_norm': 5.7404398918151855, 'learning_rate': 1.2018209505246755e-05} - -[Rank 2] Trainer log: {'loss': 0.3782, 'grad_norm': 5.7404398918151855, 'learning_rate': 1.2018209505246755e-05} - -{'loss': 0.3782, 'grad_norm': 5.7404398918151855, 'learning_rate': 1.2018209505246755e-05, 'epoch': 0.46} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002496513538062573, 'train/lm_loss': 5.008546868339181e-05, 'train/info_loss': 3.301988181192428e-05, 'train/ref_loss': 0.1663041114807129, 'train/uncertainty_loss': -6.888334173709154e-05, 'train/video_loss': 0.1682654619216919, 'train/total_loss': 0.16831554472446442} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4303, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002341665094718337, 'train/lm_loss': 2.291071286890656e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.5033798813819885, 'train/uncertainty_loss': 0.043025305867195135, 'train/video_loss': 0.5483001470565796, 'train/total_loss': 0.5483230352401733} -[Rank 0] Trainer log: {'loss': 0.3475, 'grad_norm': 9.260931015014648, 'learning_rate': 1.2007760323849469e-05}[Rank 2] Trainer log: {'loss': 0.3475, 'grad_norm': 9.260931015014648, 'learning_rate': 1.2007760323849469e-05} -[Rank 1] Trainer log: {'loss': 0.3475, 'grad_norm': 9.260931015014648, 'learning_rate': 1.2007760323849469e-05} - -[Rank 3] Trainer log: {'loss': 0.3475, 'grad_norm': 9.260931015014648, 'learning_rate': 1.2007760323849469e-05} -{'loss': 0.3475, 'grad_norm': 9.260931015014648, 'learning_rate': 1.2007760323849469e-05, 'epoch': 0.46} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0849, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024291102308779955, 'train/lm_loss': 3.3852574415504935e-05, 'train/info_loss': 2.6344558136770502e-05, 'train/ref_loss': 0.2707577347755432, 'train/uncertainty_loss': 0.00848771557211876, 'train/video_loss': 0.2812150716781616, 'train/total_loss': 0.28124892711639404} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13422586917877197, 'train/info_loss': 0.17880848050117493, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001083115115761757, 'train/video_loss': 0.17870016396045685, 'train/total_loss': 0.3129260540008545} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3897, 'grad_norm': 5.285192966461182, 'learning_rate': 1.199730885769015e-05}[Rank 3] Trainer log: {'loss': 0.3897, 'grad_norm': 5.285192966461182, 'learning_rate': 1.199730885769015e-05}[Rank 1] Trainer log: {'loss': 0.3897, 'grad_norm': 5.285192966461182, 'learning_rate': 1.199730885769015e-05} - -[Rank 2] Trainer log: {'loss': 0.3897, 'grad_norm': 5.285192966461182, 'learning_rate': 1.199730885769015e-05} - -{'loss': 0.3897, 'grad_norm': 5.285192966461182, 'learning_rate': 1.199730885769015e-05, 'epoch': 0.46} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4210, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1965, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00040078959427773957, 'train/lm_loss': 3.826252068392933e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.3452445864677429, 'train/uncertainty_loss': 0.019651158154010775, 'train/video_loss': 0.36813053488731384, 'train/total_loss': 0.36816880106925964} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28092844486236573, 'train/info_loss': 0.18251769244670868, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010658128885552288, 'train/video_loss': 0.1824111044406891, 'train/total_loss': 0.46333956718444824} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2914, 'grad_norm': 7.618317604064941, 'learning_rate': 1.1986855118662207e-05}[Rank 3] Trainer log: {'loss': 0.2914, 'grad_norm': 7.618317604064941, 'learning_rate': 1.1986855118662207e-05}[Rank 0] Trainer log: {'loss': 0.2914, 'grad_norm': 7.618317604064941, 'learning_rate': 1.1986855118662207e-05} - - -[Rank 2] Trainer log: {'loss': 0.2914, 'grad_norm': 7.618317604064941, 'learning_rate': 1.1986855118662207e-05} -{'loss': 0.2914, 'grad_norm': 7.618317604064941, 'learning_rate': 1.1986855118662207e-05, 'epoch': 0.46} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0117, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001761137740686536, 'train/lm_loss': 2.6224323664791883e-05, 'train/info_loss': 2.396049239905551e-05, 'train/ref_loss': 0.19774232804775238, 'train/uncertainty_loss': 0.001167740486562252, 'train/video_loss': 0.20034293830394745, 'train/total_loss': 0.20036916434764862} -tensor(0.2121, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1895283818244934, 'train/info_loss': 0.2313016653060913, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010142590617761016, 'train/video_loss': 0.2312002331018448, 'train/total_loss': 0.4207286238670349} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.359, 'grad_norm': 3.5624871253967285, 'learning_rate': 1.1976399118661628e-05}[Rank 1] Trainer log: {'loss': 0.359, 'grad_norm': 3.5624871253967285, 'learning_rate': 1.1976399118661628e-05} - -[Rank 0] Trainer log: {'loss': 0.359, 'grad_norm': 3.5624871253967285, 'learning_rate': 1.1976399118661628e-05}[Rank 2] Trainer log: {'loss': 0.359, 'grad_norm': 3.5624871253967285, 'learning_rate': 1.1976399118661628e-05} - -{'loss': 0.359, 'grad_norm': 3.5624871253967285, 'learning_rate': 1.1976399118661628e-05, 'epoch': 0.46} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29790048599243163, 'train/info_loss': 0.1190960705280304, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013914566952735185, 'train/video_loss': 0.11895692348480225, 'train/total_loss': 0.41685742139816284} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18764970302581788, 'train/info_loss': 0.2449886053800583, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014071089681237938, 'train/video_loss': 0.24484789371490479, 'train/total_loss': 0.43249762058258057} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.1898, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4003, 'grad_norm': 7.772458076477051, 'learning_rate': 1.1965940869586988e-05} -[Rank 0] Trainer log: {'loss': 0.4003, 'grad_norm': 7.772458076477051, 'learning_rate': 1.1965940869586988e-05}[Rank 1] Trainer log: {'loss': 0.4003, 'grad_norm': 7.772458076477051, 'learning_rate': 1.1965940869586988e-05} - -[Rank 2] Trainer log: {'loss': 0.4003, 'grad_norm': 7.772458076477051, 'learning_rate': 1.1965940869586988e-05} -{'loss': 0.4003, 'grad_norm': 7.772458076477051, 'learning_rate': 1.1965940869586988e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3453739881515503, 'train/info_loss': 0.2901858389377594, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012429979396983982, 'train/video_loss': 0.2900615334510803, 'train/total_loss': 0.6354355216026306} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2357, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19169619083404543, 'train/info_loss': 0.21204018592834473, 'train/ref_loss': None, 'train/uncertainty_loss': -9.569276589900256e-05, 'train/video_loss': 0.21194449067115784, 'train/total_loss': 0.4036406874656677} -tensor(0.0003, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.445, 'grad_norm': 6.691404342651367, 'learning_rate': 1.1955480383339407e-05}[Rank 3] Trainer log: {'loss': 0.445, 'grad_norm': 6.691404342651367, 'learning_rate': 1.1955480383339407e-05}[Rank 0] Trainer log: {'loss': 0.445, 'grad_norm': 6.691404342651367, 'learning_rate': 1.1955480383339407e-05} - -[Rank 2] Trainer log: {'loss': 0.445, 'grad_norm': 6.691404342651367, 'learning_rate': 1.1955480383339407e-05} - -{'loss': 0.445, 'grad_norm': 6.691404342651367, 'learning_rate': 1.1955480383339407e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1423, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1105, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026927618309855464, 'train/lm_loss': 2.6462710229679943e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.28684002161026, 'train/uncertainty_loss': 0.011051946133375169, 'train/video_loss': 0.30007073283195496, 'train/total_loss': 0.30009719729423523} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.280925726890564, 'train/info_loss': 0.20828574895858765, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010331132216379046, 'train/video_loss': 0.2081824392080307, 'train/total_loss': 0.489108145236969} -[Rank 3] Trainer log: {'loss': 0.3801, 'grad_norm': 3.918121337890625, 'learning_rate': 1.1945017671822562e-05} -[Rank 0] Trainer log: {'loss': 0.3801, 'grad_norm': 3.918121337890625, 'learning_rate': 1.1945017671822562e-05}[Rank 1] Trainer log: {'loss': 0.3801, 'grad_norm': 3.918121337890625, 'learning_rate': 1.1945017671822562e-05} -[Rank 2] Trainer log: {'loss': 0.3801, 'grad_norm': 3.918121337890625, 'learning_rate': 1.1945017671822562e-05} - -{'loss': 0.3801, 'grad_norm': 3.918121337890625, 'learning_rate': 1.1945017671822562e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33357975482940677, 'train/info_loss': 0.20864668488502502, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012432695366442205, 'train/video_loss': 0.20852236449718475, 'train/total_loss': 0.5421020984649658} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4322085380554199, 'train/info_loss': 0.10044533014297485, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012036239495500923, 'train/video_loss': 0.10032496601343155, 'train/total_loss': 0.5325335264205933} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1521, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3287, 'grad_norm': 7.887557029724121, 'learning_rate': 1.1934552746942653e-05} -[Rank 3] Trainer log: {'loss': 0.3287, 'grad_norm': 7.887557029724121, 'learning_rate': 1.1934552746942653e-05}[Rank 2] Trainer log: {'loss': 0.3287, 'grad_norm': 7.887557029724121, 'learning_rate': 1.1934552746942653e-05} - -[Rank 0] Trainer log: {'loss': 0.3287, 'grad_norm': 7.887557029724121, 'learning_rate': 1.1934552746942653e-05} -{'loss': 0.3287, 'grad_norm': 7.887557029724121, 'learning_rate': 1.1934552746942653e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20186541080474854, 'train/info_loss': 0.13150370121002197, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010369092924520374, 'train/video_loss': 0.131400004029274, 'train/total_loss': 0.33326542377471924} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.3736, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.3543, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005315700545907021, 'train/lm_loss': 3.411478828638792e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.45376256108283997, 'train/uncertainty_loss': 0.03542972505092621, 'train/video_loss': 0.49347397685050964, 'train/total_loss': 0.49350810050964355} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4207, 'grad_norm': 6.384531021118164, 'learning_rate': 1.1924085620608415e-05} -[Rank 3] Trainer log: {'loss': 0.4207, 'grad_norm': 6.384531021118164, 'learning_rate': 1.1924085620608415e-05}[Rank 0] Trainer log: {'loss': 0.4207, 'grad_norm': 6.384531021118164, 'learning_rate': 1.1924085620608415e-05} - -[Rank 2] Trainer log: {'loss': 0.4207, 'grad_norm': 6.384531021118164, 'learning_rate': 1.1924085620608415e-05} -{'loss': 0.4207, 'grad_norm': 6.384531021118164, 'learning_rate': 1.1924085620608415e-05, 'epoch': 0.47} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16012197732925415, 'train/info_loss': 0.12925752997398376, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011373310117051006, 'train/video_loss': 0.12914380431175232, 'train/total_loss': 0.28926578164100647} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0130, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31383552551269533, 'train/info_loss': 0.19422118365764618, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011992219369858504, 'train/video_loss': 0.1941012591123581, 'train/total_loss': 0.5079367756843567} -tensor(0.4051, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3885, 'grad_norm': 8.61880874633789, 'learning_rate': 1.1913616304731064e-05}[Rank 1] Trainer log: {'loss': 0.3885, 'grad_norm': 8.61880874633789, 'learning_rate': 1.1913616304731064e-05} - -[Rank 2] Trainer log: {'loss': 0.3885, 'grad_norm': 8.61880874633789, 'learning_rate': 1.1913616304731064e-05}[Rank 3] Trainer log: {'loss': 0.3885, 'grad_norm': 8.61880874633789, 'learning_rate': 1.1913616304731064e-05} - -{'loss': 0.3885, 'grad_norm': 8.61880874633789, 'learning_rate': 1.1913616304731064e-05, 'epoch': 0.47} -tensor(0.1041, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3455, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003727975068613887, 'train/lm_loss': 3.406711330171675e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.46691834926605225, 'train/uncertainty_loss': 0.03455016016960144, 'train/video_loss': 0.5044807195663452, 'train/total_loss': 0.5045148134231567} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1768, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1381088137626648, 'train/info_loss': 0.13631321489810944, 'train/ref_loss': None, 'train/uncertainty_loss': -9.102633339352906e-05, 'train/video_loss': 0.13622218370437622, 'train/total_loss': 0.2743310034275055} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3499, 'grad_norm': 10.028788566589355, 'learning_rate': 1.1903144811224331e-05}[Rank 0] Trainer log: {'loss': 0.3499, 'grad_norm': 10.028788566589355, 'learning_rate': 1.1903144811224331e-05}[Rank 3] Trainer log: {'loss': 0.3499, 'grad_norm': 10.028788566589355, 'learning_rate': 1.1903144811224331e-05} - - -[Rank 2] Trainer log: {'loss': 0.3499, 'grad_norm': 10.028788566589355, 'learning_rate': 1.1903144811224331e-05} -{'loss': 0.3499, 'grad_norm': 10.028788566589355, 'learning_rate': 1.1903144811224331e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0142, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1171, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18946840763092043, 'train/info_loss': 0.26211726665496826, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012479206779971718, 'train/video_loss': 0.261992484331131, 'train/total_loss': 0.45146089792251587} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0244, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0162, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023176642134785654, 'train/lm_loss': 2.9943132540211084e-05, 'train/info_loss': 2.7178979507880285e-05, 'train/ref_loss': 0.21101127564907074, 'train/uncertainty_loss': 0.0016166271641850473, 'train/video_loss': 0.21450921893119812, 'train/total_loss': 0.21453915536403656} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.318, 'grad_norm': 2.79905366897583, 'learning_rate': 1.1892671152004407e-05}[Rank 0] Trainer log: {'loss': 0.318, 'grad_norm': 2.79905366897583, 'learning_rate': 1.1892671152004407e-05} -[Rank 1] Trainer log: {'loss': 0.318, 'grad_norm': 2.79905366897583, 'learning_rate': 1.1892671152004407e-05} -[Rank 2] Trainer log: {'loss': 0.318, 'grad_norm': 2.79905366897583, 'learning_rate': 1.1892671152004407e-05} - -{'loss': 0.318, 'grad_norm': 2.79905366897583, 'learning_rate': 1.1892671152004407e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0411, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002444184385240078, 'train/lm_loss': 2.6033614994958045e-05, 'train/info_loss': 2.4735316401347518e-05, 'train/ref_loss': 0.18131670355796814, 'train/uncertainty_loss': 0.004114103317260742, 'train/video_loss': 0.1874108910560608, 'train/total_loss': 0.18743692338466644} -tensor(0.2359, device='cuda:3', grad_fn=) tensor(0.1057, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18174939155578615, 'train/info_loss': 0.1980365663766861, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010800288291648031, 'train/video_loss': 0.1979285627603531, 'train/total_loss': 0.379677951335907} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2558, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3100, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.369, 'grad_norm': 9.383964538574219, 'learning_rate': 1.1882195338989959e-05}[Rank 3] Trainer log: {'loss': 0.369, 'grad_norm': 9.383964538574219, 'learning_rate': 1.1882195338989959e-05} - -[Rank 2] Trainer log: {'loss': 0.369, 'grad_norm': 9.383964538574219, 'learning_rate': 1.1882195338989959e-05} -[Rank 0] Trainer log: {'loss': 0.369, 'grad_norm': 9.383964538574219, 'learning_rate': 1.1882195338989959e-05} -{'loss': 0.369, 'grad_norm': 9.383964538574219, 'learning_rate': 1.1882195338989959e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0031, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002766693709418178, 'train/lm_loss': 5.630654050037265e-05, 'train/info_loss': 3.599991032388061e-05, 'train/ref_loss': 0.22067317366600037, 'train/uncertainty_loss': 0.0003075884422287345, 'train/video_loss': 0.22323012351989746, 'train/total_loss': 0.223286435008049} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22584204673767092, 'train/info_loss': 0.09863081574440002, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001006715465337038, 'train/video_loss': 0.09853014349937439, 'train/total_loss': 0.32437217235565186} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1116, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3664, 'grad_norm': 3.1393234729766846, 'learning_rate': 1.18717173841021e-05}[Rank 3] Trainer log: {'loss': 0.3664, 'grad_norm': 3.1393234729766846, 'learning_rate': 1.18717173841021e-05} - -[Rank 2] Trainer log: {'loss': 0.3664, 'grad_norm': 3.1393234729766846, 'learning_rate': 1.18717173841021e-05} -[Rank 0] Trainer log: {'loss': 0.3664, 'grad_norm': 3.1393234729766846, 'learning_rate': 1.18717173841021e-05} -{'loss': 0.3664, 'grad_norm': 3.1393234729766846, 'learning_rate': 1.18717173841021e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1548212170600891, 'train/info_loss': 0.1912440061569214, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012215909082442523, 'train/video_loss': 0.19112184643745422, 'train/total_loss': 0.34594306349754333} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3509, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5016772270202637, 'train/info_loss': 0.20465052127838135, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011649085208773614, 'train/video_loss': 0.20453402400016785, 'train/total_loss': 0.7062112092971802} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4363, 'grad_norm': 5.512862682342529, 'learning_rate': 1.186123729926438e-05}[Rank 3] Trainer log: {'loss': 0.4363, 'grad_norm': 5.512862682342529, 'learning_rate': 1.186123729926438e-05} - -[Rank 1] Trainer log: {'loss': 0.4363, 'grad_norm': 5.512862682342529, 'learning_rate': 1.186123729926438e-05} -[Rank 0] Trainer log: {'loss': 0.4363, 'grad_norm': 5.512862682342529, 'learning_rate': 1.186123729926438e-05} -{'loss': 0.4363, 'grad_norm': 5.512862682342529, 'learning_rate': 1.186123729926438e-05, 'epoch': 0.47} -tensor(0.0254, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1572, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1978, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1875, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020413734018802644, 'train/lm_loss': 2.9919293592683974e-05, 'train/info_loss': 2.7178979507880285e-05, 'train/ref_loss': 0.34020742774009705, 'train/uncertainty_loss': 0.01875142753124237, 'train/video_loss': 0.36061912775039673, 'train/total_loss': 0.360649049282074} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0808, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1266, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004211803898215294, 'train/lm_loss': 4.407873784657568e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.2958466112613678, 'train/uncertainty_loss': 0.012664102017879486, 'train/video_loss': 0.3119127154350281, 'train/total_loss': 0.3119567930698395} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3245, 'grad_norm': 1.8389372825622559, 'learning_rate': 1.1850755096402775e-05}[Rank 3] Trainer log: {'loss': 0.3245, 'grad_norm': 1.8389372825622559, 'learning_rate': 1.1850755096402775e-05} -[Rank 1] Trainer log: {'loss': 0.3245, 'grad_norm': 1.8389372825622559, 'learning_rate': 1.1850755096402775e-05} - -[Rank 2] Trainer log: {'loss': 0.3245, 'grad_norm': 1.8389372825622559, 'learning_rate': 1.1850755096402775e-05} -{'loss': 0.3245, 'grad_norm': 1.8389372825622559, 'learning_rate': 1.1850755096402775e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3304593086242676, 'train/info_loss': 0.20470379292964935, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001317944610491395, 'train/video_loss': 0.20457199215888977, 'train/total_loss': 0.5350313186645508} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4384347915649414, 'train/info_loss': 0.17932704091072083, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012035211548209191, 'train/video_loss': 0.17920668423175812, 'train/total_loss': 0.6176415085792542} -tensor(0.2823, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0160, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3943, 'grad_norm': 7.807502746582031, 'learning_rate': 1.1840270787445676e-05}[Rank 0] Trainer log: {'loss': 0.3943, 'grad_norm': 7.807502746582031, 'learning_rate': 1.1840270787445676e-05} -[Rank 3] Trainer log: {'loss': 0.3943, 'grad_norm': 7.807502746582031, 'learning_rate': 1.1840270787445676e-05} - -{'loss': 0.3943, 'grad_norm': 7.807502746582031, 'learning_rate': 1.1840270787445676e-05, 'epoch': 0.47} -[Rank 1] Trainer log: {'loss': 0.3943, 'grad_norm': 7.807502746582031, 'learning_rate': 1.1840270787445676e-05} -tensor(0.3815, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023143216967582705, 'train/lm_loss': 3.857240662910044e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.46654045581817627, 'train/uncertainty_loss': 0.038146120309829716, 'train/video_loss': 0.5065687894821167, 'train/total_loss': 0.5066073536872864} -tensor(0.0422, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3984688997268677, 'train/info_loss': 0.1677258014678955, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010574670741334558, 'train/video_loss': 0.16762004792690277, 'train/total_loss': 0.5660889744758606} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3929, 'grad_norm': 3.630394458770752, 'learning_rate': 1.1829784384323857e-05} -[Rank 0] Trainer log: {'loss': 0.3929, 'grad_norm': 3.630394458770752, 'learning_rate': 1.1829784384323857e-05}[Rank 3] Trainer log: {'loss': 0.3929, 'grad_norm': 3.630394458770752, 'learning_rate': 1.1829784384323857e-05} -[Rank 1] Trainer log: {'loss': 0.3929, 'grad_norm': 3.630394458770752, 'learning_rate': 1.1829784384323857e-05} - -{'loss': 0.3929, 'grad_norm': 3.630394458770752, 'learning_rate': 1.1829784384323857e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32100677490234375, 'train/info_loss': 0.1156475692987442, 'train/ref_loss': None, 'train/uncertainty_loss': -9.96468123048544e-05, 'train/video_loss': 0.11554792523384094, 'train/total_loss': 0.4365547001361847} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1961, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16437565088272096, 'train/info_loss': 0.3938767611980438, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000144502311013639, 'train/video_loss': 0.3937322497367859, 'train/total_loss': 0.5581079125404358} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4578, 'grad_norm': 4.779919624328613, 'learning_rate': 1.1819295898970487e-05}[Rank 1] Trainer log: {'loss': 0.4578, 'grad_norm': 4.779919624328613, 'learning_rate': 1.1819295898970487e-05} -[Rank 2] Trainer log: {'loss': 0.4578, 'grad_norm': 4.779919624328613, 'learning_rate': 1.1819295898970487e-05} -[Rank 3] Trainer log: {'loss': 0.4578, 'grad_norm': 4.779919624328613, 'learning_rate': 1.1819295898970487e-05} - -{'loss': 0.4578, 'grad_norm': 4.779919624328613, 'learning_rate': 1.1819295898970487e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13228940963745117, 'train/info_loss': 0.19716569781303406, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011520024854689838, 'train/video_loss': 0.19705049693584442, 'train/total_loss': 0.3293399214744568} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1408, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0671, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026190488133579496, 'train/lm_loss': 4.972793394699693e-05, 'train/info_loss': 3.516550350468606e-05, 'train/ref_loss': 0.2596268057823181, 'train/uncertainty_loss': 0.006707420200109482, 'train/video_loss': 0.2684646546840668, 'train/total_loss': 0.26851439476013184} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3676, 'grad_norm': 2.605509042739868, 'learning_rate': 1.1808805343321102e-05} -[Rank 0] Trainer log: {'loss': 0.3676, 'grad_norm': 2.605509042739868, 'learning_rate': 1.1808805343321102e-05}[Rank 2] Trainer log: {'loss': 0.3676, 'grad_norm': 2.605509042739868, 'learning_rate': 1.1808805343321102e-05}[Rank 3] Trainer log: {'loss': 0.3676, 'grad_norm': 2.605509042739868, 'learning_rate': 1.1808805343321102e-05} - - -{'loss': 0.3676, 'grad_norm': 2.605509042739868, 'learning_rate': 1.1808805343321102e-05, 'epoch': 0.47} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0952, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026166634634137157, 'train/lm_loss': 3.401943831704557e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.17315009236335754, 'train/uncertainty_loss': -7.208792376331986e-05, 'train/video_loss': 0.1752011924982071, 'train/total_loss': 0.17523521184921265} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3020282030105591, 'train/info_loss': 0.20468373596668243, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011652061948552728, 'train/video_loss': 0.20456720888614655, 'train/total_loss': 0.5065954327583313} -tensor(0.0063, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1672, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2274, 'grad_norm': 12.466569900512695, 'learning_rate': 1.1798312729313589e-05} -[Rank 2] Trainer log: {'loss': 0.2274, 'grad_norm': 12.466569900512695, 'learning_rate': 1.1798312729313589e-05} -[Rank 1] Trainer log: {'loss': 0.2274, 'grad_norm': 12.466569900512695, 'learning_rate': 1.1798312729313589e-05} -[Rank 0] Trainer log: {'loss': 0.2274, 'grad_norm': 12.466569900512695, 'learning_rate': 1.1798312729313589e-05} -{'loss': 0.2274, 'grad_norm': 12.466569900512695, 'learning_rate': 1.1798312729313589e-05, 'epoch': 0.47} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19453266859054566, 'train/info_loss': 0.25139251351356506, 'train/ref_loss': None, 'train/uncertainty_loss': -8.624134934507311e-05, 'train/video_loss': 0.2513062655925751, 'train/total_loss': 0.44583892822265625} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1912, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03427465260028839, 'train/info_loss': 0.22313757240772247, 'train/ref_loss': None, 'train/uncertainty_loss': -9.493089164607227e-05, 'train/video_loss': 0.22304263710975647, 'train/total_loss': 0.25731730461120605} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3121, 'grad_norm': 5.383796215057373, 'learning_rate': 1.1787818068888182e-05}[Rank 2] Trainer log: {'loss': 0.3121, 'grad_norm': 5.383796215057373, 'learning_rate': 1.1787818068888182e-05}[Rank 0] Trainer log: {'loss': 0.3121, 'grad_norm': 5.383796215057373, 'learning_rate': 1.1787818068888182e-05} - -[Rank 1] Trainer log: {'loss': 0.3121, 'grad_norm': 5.383796215057373, 'learning_rate': 1.1787818068888182e-05} - -{'loss': 0.3121, 'grad_norm': 5.383796215057373, 'learning_rate': 1.1787818068888182e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0641, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002798251807689667, 'train/lm_loss': 4.403106577228755e-05, 'train/info_loss': 3.278148142271675e-05, 'train/ref_loss': 0.17949241399765015, 'train/uncertainty_loss': -7.114529144018889e-05, 'train/video_loss': 0.18169265985488892, 'train/total_loss': 0.18173669278621674} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20705358982086183, 'train/info_loss': 0.10498198866844177, 'train/ref_loss': None, 'train/uncertainty_loss': -8.303910726681352e-05, 'train/video_loss': 0.10489895194768906, 'train/total_loss': 0.31195253133773804} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3281, 'grad_norm': 4.925294876098633, 'learning_rate': 1.1777321373987448e-05}[Rank 3] Trainer log: {'loss': 0.3281, 'grad_norm': 4.925294876098633, 'learning_rate': 1.1777321373987448e-05}[Rank 2] Trainer log: {'loss': 0.3281, 'grad_norm': 4.925294876098633, 'learning_rate': 1.1777321373987448e-05} - - -[Rank 0] Trainer log: {'loss': 0.3281, 'grad_norm': 4.925294876098633, 'learning_rate': 1.1777321373987448e-05} -{'loss': 0.3281, 'grad_norm': 4.925294876098633, 'learning_rate': 1.1777321373987448e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.208056640625, 'train/info_loss': 0.15391366183757782, 'train/ref_loss': None, 'train/uncertainty_loss': -9.629805572330952e-05, 'train/video_loss': 0.15381737053394318, 'train/total_loss': 0.3618740141391754} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17169865369796755, 'train/info_loss': 0.1796208620071411, 'train/ref_loss': None, 'train/uncertainty_loss': -9.78795113041997e-05, 'train/video_loss': 0.17952297627925873, 'train/total_loss': 0.35122162103652954} -tensor(0.3936, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4182, 'grad_norm': 6.666947364807129, 'learning_rate': 1.1766822656556258e-05} -[Rank 2] Trainer log: {'loss': 0.4182, 'grad_norm': 6.666947364807129, 'learning_rate': 1.1766822656556258e-05} -[Rank 1] Trainer log: {'loss': 0.4182, 'grad_norm': 6.666947364807129, 'learning_rate': 1.1766822656556258e-05} -[Rank 0] Trainer log: {'loss': 0.4182, 'grad_norm': 6.666947364807129, 'learning_rate': 1.1766822656556258e-05} -{'loss': 0.4182, 'grad_norm': 6.666947364807129, 'learning_rate': 1.1766822656556258e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2647, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1514, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000278975535184145, 'train/lm_loss': 4.424559301696718e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.20017006993293762, 'train/uncertainty_loss': -6.765577127225697e-05, 'train/video_loss': 0.202364981174469, 'train/total_loss': 0.20240922272205353} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2177219867706299, 'train/info_loss': 0.21094314754009247, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012055996339768172, 'train/video_loss': 0.21082258224487305, 'train/total_loss': 0.4285445809364319} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.3143, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0824, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3599, 'grad_norm': 3.613628625869751, 'learning_rate': 1.1756321928541791e-05}[Rank 1] Trainer log: {'loss': 0.3599, 'grad_norm': 3.613628625869751, 'learning_rate': 1.1756321928541791e-05}[Rank 0] Trainer log: {'loss': 0.3599, 'grad_norm': 3.613628625869751, 'learning_rate': 1.1756321928541791e-05} - -[Rank 3] Trainer log: {'loss': 0.3599, 'grad_norm': 3.613628625869751, 'learning_rate': 1.1756321928541791e-05} - -{'loss': 0.3599, 'grad_norm': 3.613628625869751, 'learning_rate': 1.1756321928541791e-05, 'epoch': 0.47} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.5132, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003913972526788712, 'train/lm_loss': 2.9680909938178958e-05, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.5234886407852173, 'train/uncertainty_loss': 0.0513175368309021, 'train/video_loss': 0.5779640674591064, 'train/total_loss': 0.5779937505722046} -tensor(0.0947, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.8606, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002652485622093082, 'train/lm_loss': 4.991862224414945e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.7908589839935303, 'train/uncertainty_loss': 0.08606272339820863, 'train/video_loss': 0.87907475233078, 'train/total_loss': 0.879124641418457} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4024, 'grad_norm': 4.624104976654053, 'learning_rate': 1.1745819201893517e-05}[Rank 3] Trainer log: {'loss': 0.4024, 'grad_norm': 4.624104976654053, 'learning_rate': 1.1745819201893517e-05} -[Rank 2] Trainer log: {'loss': 0.4024, 'grad_norm': 4.624104976654053, 'learning_rate': 1.1745819201893517e-05} -[Rank 1] Trainer log: {'loss': 0.4024, 'grad_norm': 4.624104976654053, 'learning_rate': 1.1745819201893517e-05} - -{'loss': 0.4024, 'grad_norm': 4.624104976654053, 'learning_rate': 1.1745819201893517e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0727, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021307494025677443, 'train/lm_loss': 2.999080752488226e-05, 'train/info_loss': 2.7178979507880285e-05, 'train/ref_loss': 0.26954853534698486, 'train/uncertainty_loss': 0.007267159968614579, 'train/video_loss': 0.278547465801239, 'train/total_loss': 0.27857744693756104} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10566611289978028, 'train/info_loss': 0.17072869837284088, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001076113316230476, 'train/video_loss': 0.1706210821866989, 'train/total_loss': 0.2762871980667114} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1173, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2341, 'grad_norm': 8.402688980102539, 'learning_rate': 1.1735314488563174e-05}[Rank 2] Trainer log: {'loss': 0.2341, 'grad_norm': 8.402688980102539, 'learning_rate': 1.1735314488563174e-05}[Rank 1] Trainer log: {'loss': 0.2341, 'grad_norm': 8.402688980102539, 'learning_rate': 1.1735314488563174e-05} - -[Rank 3] Trainer log: {'loss': 0.2341, 'grad_norm': 8.402688980102539, 'learning_rate': 1.1735314488563174e-05} - -{'loss': 0.2341, 'grad_norm': 8.402688980102539, 'learning_rate': 1.1735314488563174e-05, 'epoch': 0.47} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4475138187408447, 'train/info_loss': 0.22651952505111694, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001332302577793598, 'train/video_loss': 0.22638629376888275, 'train/total_loss': 0.6739001274108887} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1381, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002610699506476522, 'train/lm_loss': 3.850089560728521e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.30670905113220215, 'train/uncertainty_loss': 0.01380835473537445, 'train/video_loss': 0.3226362466812134, 'train/total_loss': 0.3226747512817383} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3111, 'grad_norm': 3.8814995288848877, 'learning_rate': 1.1724807800504768e-05}[Rank 0] Trainer log: {'loss': 0.3111, 'grad_norm': 3.8814995288848877, 'learning_rate': 1.1724807800504768e-05}[Rank 2] Trainer log: {'loss': 0.3111, 'grad_norm': 3.8814995288848877, 'learning_rate': 1.1724807800504768e-05} - -[Rank 3] Trainer log: {'loss': 0.3111, 'grad_norm': 3.8814995288848877, 'learning_rate': 1.1724807800504768e-05} - -{'loss': 0.3111, 'grad_norm': 3.8814995288848877, 'learning_rate': 1.1724807800504768e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19897929430007935, 'train/info_loss': 0.2874358594417572, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012193910079076886, 'train/video_loss': 0.28731390833854675, 'train/total_loss': 0.4862931966781616} -tensor(0.1550, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.3668, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(0.1822, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001839180593378842, 'train/lm_loss': 2.6367357349954545e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.08047779649496078, 'train/uncertainty_loss': -7.324217003770173e-05, 'train/video_loss': 0.08190284669399261, 'train/total_loss': 0.08192921429872513} -[Rank 0] Trainer log: {'loss': 0.3559, 'grad_norm': 2.9928267002105713, 'learning_rate': 1.1714299149674538e-05}[Rank 1] Trainer log: {'loss': 0.3559, 'grad_norm': 2.9928267002105713, 'learning_rate': 1.1714299149674538e-05} -[Rank 2] Trainer log: {'loss': 0.3559, 'grad_norm': 2.9928267002105713, 'learning_rate': 1.1714299149674538e-05} - -[Rank 3] Trainer log: {'loss': 0.3559, 'grad_norm': 2.9928267002105713, 'learning_rate': 1.1714299149674538e-05}{'loss': 0.3559, 'grad_norm': 2.9928267002105713, 'learning_rate': 1.1714299149674538e-05, 'epoch': 0.47} - -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2762507438659668, 'train/info_loss': 0.17797483503818512, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012453830568119885, 'train/video_loss': 0.17785029113292694, 'train/total_loss': 0.454101026058197} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4129, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4191, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002823856193572283, 'train/lm_loss': 4.989478620700538e-05, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.14124590158462524, 'train/uncertainty_loss': -6.945232162252069e-05, 'train/video_loss': 0.1434701681137085, 'train/total_loss': 0.1435200572013855} -tensor(0.0546, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3015, 'grad_norm': 3.2518928050994873, 'learning_rate': 1.1703788548030976e-05} -[Rank 1] Trainer log: {'loss': 0.3015, 'grad_norm': 3.2518928050994873, 'learning_rate': 1.1703788548030976e-05} -[Rank 0] Trainer log: {'loss': 0.3015, 'grad_norm': 3.2518928050994873, 'learning_rate': 1.1703788548030976e-05}[Rank 2] Trainer log: {'loss': 0.3015, 'grad_norm': 3.2518928050994873, 'learning_rate': 1.1703788548030976e-05} - -{'loss': 0.3015, 'grad_norm': 3.2518928050994873, 'learning_rate': 1.1703788548030976e-05, 'epoch': 0.47} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14816219806671144, 'train/info_loss': 0.13412632048130035, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011761036003008485, 'train/video_loss': 0.13400870561599731, 'train/total_loss': 0.2821708917617798} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.7139, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33311741352081303, 'train/info_loss': 0.22063934803009033, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001343667390756309, 'train/video_loss': 0.2205049842596054, 'train/total_loss': 0.5536224246025085} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4182, 'grad_norm': 4.841800212860107, 'learning_rate': 1.169327600753478e-05}[Rank 1] Trainer log: {'loss': 0.4182, 'grad_norm': 4.841800212860107, 'learning_rate': 1.169327600753478e-05}[Rank 2] Trainer log: {'loss': 0.4182, 'grad_norm': 4.841800212860107, 'learning_rate': 1.169327600753478e-05} - - -[Rank 3] Trainer log: {'loss': 0.4182, 'grad_norm': 4.841800212860107, 'learning_rate': 1.169327600753478e-05} -{'loss': 0.4182, 'grad_norm': 4.841800212860107, 'learning_rate': 1.169327600753478e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13378100395202638, 'train/info_loss': 0.16320261359214783, 'train/ref_loss': None, 'train/uncertainty_loss': -9.566654916852713e-05, 'train/video_loss': 0.16310694813728333, 'train/total_loss': 0.29688793420791626} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1217089295387268, 'train/info_loss': 0.2321079969406128, 'train/ref_loss': None, 'train/uncertainty_loss': -9.81599441729486e-05, 'train/video_loss': 0.23200984299182892, 'train/total_loss': 0.35371875762939453} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3865, 'grad_norm': 2.9649739265441895, 'learning_rate': 1.1682761540148861e-05}[Rank 1] Trainer log: {'loss': 0.3865, 'grad_norm': 2.9649739265441895, 'learning_rate': 1.1682761540148861e-05}[Rank 2] Trainer log: {'loss': 0.3865, 'grad_norm': 2.9649739265441895, 'learning_rate': 1.1682761540148861e-05} - - -[Rank 0] Trainer log: {'loss': 0.3865, 'grad_norm': 2.9649739265441895, 'learning_rate': 1.1682761540148861e-05} -{'loss': 0.3865, 'grad_norm': 2.9649739265441895, 'learning_rate': 1.1682761540148861e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2462874174118042, 'train/info_loss': 0.20347455143928528, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012698249192908407, 'train/video_loss': 0.2033475637435913, 'train/total_loss': 0.44963496923446655} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(0.9072, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028378802817314864, 'train/lm_loss': 3.423397720325738e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.14047493040561676, 'train/uncertainty_loss': -7.136936765164138e-05, 'train/video_loss': 0.14270372688770294, 'train/total_loss': 0.1427379548549652} -[Rank 3] Trainer log: {'loss': 0.4856, 'grad_norm': 10.93110466003418, 'learning_rate': 1.1672245157838318e-05}[Rank 2] Trainer log: {'loss': 0.4856, 'grad_norm': 10.93110466003418, 'learning_rate': 1.1672245157838318e-05}[Rank 0] Trainer log: {'loss': 0.4856, 'grad_norm': 10.93110466003418, 'learning_rate': 1.1672245157838318e-05} - -[Rank 1] Trainer log: {'loss': 0.4856, 'grad_norm': 10.93110466003418, 'learning_rate': 1.1672245157838318e-05} - -{'loss': 0.4856, 'grad_norm': 10.93110466003418, 'learning_rate': 1.1672245157838318e-05, 'epoch': 0.47} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.040583577752113346, 'train/info_loss': 0.1392592489719391, 'train/ref_loss': None, 'train/uncertainty_loss': -8.89838847797364e-05, 'train/video_loss': 0.13917025923728943, 'train/total_loss': 0.179753839969635} -tensor(0.2158, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0472, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3626, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.246868634223938, 'train/info_loss': 0.13271060585975647, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012130318209528923, 'train/video_loss': 0.13258929550647736, 'train/total_loss': 0.379457950592041} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3554, 'grad_norm': 15.116262435913086, 'learning_rate': 1.1661726872570433e-05}[Rank 1] Trainer log: {'loss': 0.3554, 'grad_norm': 15.116262435913086, 'learning_rate': 1.1661726872570433e-05} - -[Rank 0] Trainer log: {'loss': 0.3554, 'grad_norm': 15.116262435913086, 'learning_rate': 1.1661726872570433e-05}[Rank 3] Trainer log: {'loss': 0.3554, 'grad_norm': 15.116262435913086, 'learning_rate': 1.1661726872570433e-05} - -{'loss': 0.3554, 'grad_norm': 15.116262435913086, 'learning_rate': 1.1661726872570433e-05, 'epoch': 0.47} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.112091064453125, 'train/info_loss': 0.16255244612693787, 'train/ref_loss': None, 'train/uncertainty_loss': -9.544656495563686e-05, 'train/video_loss': 0.16245700418949127, 'train/total_loss': 0.2745480537414551} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3863, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003159353043884039, 'train/lm_loss': 0.00015491206431761384, 'train/info_loss': 3.951631879317574e-05, 'train/ref_loss': 0.10809344798326492, 'train/uncertainty_loss': -7.19727308023721e-05, 'train/video_loss': 0.11058847606182098, 'train/total_loss': 0.11074338853359222} -[Rank 1] Trainer log: {'loss': 0.3131, 'grad_norm': 5.331007957458496, 'learning_rate': 1.1651206696314652e-05}[Rank 3] Trainer log: {'loss': 0.3131, 'grad_norm': 5.331007957458496, 'learning_rate': 1.1651206696314652e-05}[Rank 2] Trainer log: {'loss': 0.3131, 'grad_norm': 5.331007957458496, 'learning_rate': 1.1651206696314652e-05}[Rank 0] Trainer log: {'loss': 0.3131, 'grad_norm': 5.331007957458496, 'learning_rate': 1.1651206696314652e-05} - - - -{'loss': 0.3131, 'grad_norm': 5.331007957458496, 'learning_rate': 1.1651206696314652e-05, 'epoch': 0.47} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1499, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026950447354465726, 'train/lm_loss': 0.0001428808900527656, 'train/info_loss': 3.921831739717163e-05, 'train/ref_loss': 0.3114229440689087, 'train/uncertainty_loss': 0.014990980923175813, 'train/video_loss': 0.3286091983318329, 'train/total_loss': 0.3287520706653595} -tensor(0.2383, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17971910238265992, 'train/info_loss': 0.13360801339149475, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012586849043145777, 'train/video_loss': 0.13348214328289032, 'train/total_loss': 0.31320124864578247} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3559, 'grad_norm': 2.3000268936157227, 'learning_rate': 1.1640684641042573e-05}[Rank 2] Trainer log: {'loss': 0.3559, 'grad_norm': 2.3000268936157227, 'learning_rate': 1.1640684641042573e-05} - -[Rank 3] Trainer log: {'loss': 0.3559, 'grad_norm': 2.3000268936157227, 'learning_rate': 1.1640684641042573e-05} -[Rank 0] Trainer log: {'loss': 0.3559, 'grad_norm': 2.3000268936157227, 'learning_rate': 1.1640684641042573e-05} -{'loss': 0.3559, 'grad_norm': 2.3000268936157227, 'learning_rate': 1.1640684641042573e-05, 'epoch': 0.48} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0989, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001980475150048733, 'train/lm_loss': 5.0347665091976526e-05, 'train/info_loss': 3.43310966854915e-05, 'train/ref_loss': 0.26942309737205505, 'train/uncertainty_loss': 0.009885582327842713, 'train/video_loss': 0.2809273898601532, 'train/total_loss': 0.280977725982666} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00014713137643411757, 'train/lm_loss': 2.9609396005980673e-05, 'train/info_loss': 2.6344558136770502e-05, 'train/ref_loss': 0.13817471265792847, 'train/uncertainty_loss': -7.16421753168106e-05, 'train/video_loss': 0.1393064707517624, 'train/total_loss': 0.13933607935905457} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3123, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3329, 'grad_norm': 5.163388252258301, 'learning_rate': 1.163016071872793e-05}[Rank 3] Trainer log: {'loss': 0.3329, 'grad_norm': 5.163388252258301, 'learning_rate': 1.163016071872793e-05}[Rank 2] Trainer log: {'loss': 0.3329, 'grad_norm': 5.163388252258301, 'learning_rate': 1.163016071872793e-05} - -[Rank 1] Trainer log: {'loss': 0.3329, 'grad_norm': 5.163388252258301, 'learning_rate': 1.163016071872793e-05} - -{'loss': 0.3329, 'grad_norm': 5.163388252258301, 'learning_rate': 1.163016071872793e-05, 'epoch': 0.48} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1824371099472046, 'train/info_loss': 0.16691520810127258, 'train/ref_loss': None, 'train/uncertainty_loss': -9.396208333782852e-05, 'train/video_loss': 0.16682124137878418, 'train/total_loss': 0.3492583632469177} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0464, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3515, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1558, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002864166395738721, 'train/lm_loss': 4.9704097909852864e-05, 'train/info_loss': 3.1529863917967305e-05, 'train/ref_loss': 0.3097907602787018, 'train/uncertainty_loss': 0.015577034652233125, 'train/video_loss': 0.32769066095352173, 'train/total_loss': 0.3277403712272644} -[Rank 3] Trainer log: {'loss': 0.3776, 'grad_norm': 9.347774505615234, 'learning_rate': 1.1619634941346587e-05}[Rank 1] Trainer log: {'loss': 0.3776, 'grad_norm': 9.347774505615234, 'learning_rate': 1.1619634941346587e-05} -[Rank 0] Trainer log: {'loss': 0.3776, 'grad_norm': 9.347774505615234, 'learning_rate': 1.1619634941346587e-05} - -[Rank 2] Trainer log: {'loss': 0.3776, 'grad_norm': 9.347774505615234, 'learning_rate': 1.1619634941346587e-05} -{'loss': 0.3776, 'grad_norm': 9.347774505615234, 'learning_rate': 1.1619634941346587e-05, 'epoch': 0.48} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0386, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0735, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002512803766876459, 'train/lm_loss': 3.418630221858621e-05, 'train/info_loss': 2.7178979507880285e-05, 'train/ref_loss': 0.21829789876937866, 'train/uncertainty_loss': 0.007351279258728027, 'train/video_loss': 0.22768659889698029, 'train/total_loss': 0.22772078216075897} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3265918254852295, 'train/info_loss': 0.20793719589710236, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012106230715289712, 'train/video_loss': 0.20781613886356354, 'train/total_loss': 0.5344079732894897} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1183, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3024, 'grad_norm': 7.3147711753845215, 'learning_rate': 1.1609107320876514e-05}[Rank 2] Trainer log: {'loss': 0.3024, 'grad_norm': 7.3147711753845215, 'learning_rate': 1.1609107320876514e-05}[Rank 1] Trainer log: {'loss': 0.3024, 'grad_norm': 7.3147711753845215, 'learning_rate': 1.1609107320876514e-05} - - -[Rank 0] Trainer log: {'loss': 0.3024, 'grad_norm': 7.3147711753845215, 'learning_rate': 1.1609107320876514e-05} -{'loss': 0.3024, 'grad_norm': 7.3147711753845215, 'learning_rate': 1.1609107320876514e-05, 'epoch': 0.48} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12222353219985962, 'train/info_loss': 0.15249910950660706, 'train/ref_loss': None, 'train/uncertainty_loss': -8.578101405873895e-05, 'train/video_loss': 0.15241332352161407, 'train/total_loss': 0.2746368646621704} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0595, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3764825820922852, 'train/info_loss': 0.16000248491764069, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013135800836607813, 'train/video_loss': 0.15987113118171692, 'train/total_loss': 0.5363537073135376} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3659, 'grad_norm': 4.198408603668213, 'learning_rate': 1.1598577869297777e-05} -[Rank 2] Trainer log: {'loss': 0.3659, 'grad_norm': 4.198408603668213, 'learning_rate': 1.1598577869297777e-05} -[Rank 1] Trainer log: {'loss': 0.3659, 'grad_norm': 4.198408603668213, 'learning_rate': 1.1598577869297777e-05} -[Rank 0] Trainer log: {'loss': 0.3659, 'grad_norm': 4.198408603668213, 'learning_rate': 1.1598577869297777e-05} -{'loss': 0.3659, 'grad_norm': 4.198408603668213, 'learning_rate': 1.1598577869297777e-05, 'epoch': 0.48} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13857451677322388, 'train/info_loss': 0.16664539277553558, 'train/ref_loss': None, 'train/uncertainty_loss': -8.935428340919317e-05, 'train/video_loss': 0.16655604541301727, 'train/total_loss': 0.30513057112693787} -tensor(0.1914, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.5260, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(1.0323, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002538758097216487, 'train/lm_loss': 3.0086160404607656e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 1.0364562273025513, 'train/uncertainty_loss': 0.10322513580322266, 'train/video_loss': 1.1417396068572998, 'train/total_loss': 1.1417696475982666} -[Rank 1] Trainer log: {'loss': 0.5071, 'grad_norm': 3.125058889389038, 'learning_rate': 1.158804659859254e-05}[Rank 0] Trainer log: {'loss': 0.5071, 'grad_norm': 3.125058889389038, 'learning_rate': 1.158804659859254e-05}[Rank 2] Trainer log: {'loss': 0.5071, 'grad_norm': 3.125058889389038, 'learning_rate': 1.158804659859254e-05} -[Rank 3] Trainer log: {'loss': 0.5071, 'grad_norm': 3.125058889389038, 'learning_rate': 1.158804659859254e-05} - - -{'loss': 0.5071, 'grad_norm': 3.125058889389038, 'learning_rate': 1.158804659859254e-05, 'epoch': 0.48} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1508, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027974343392997983, 'train/lm_loss': 2.6128970785066488e-05, 'train/info_loss': 2.515252708690241e-05, 'train/ref_loss': 0.2968696057796478, 'train/uncertainty_loss': 0.01508249044418335, 'train/video_loss': 0.31421518325805664, 'train/total_loss': 0.31424131989479065} -tensor(0.0793, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(0.9798, device='cuda:1', grad_fn=) {'train/tv_loss': 0.00032845241948962215, 'train/lm_loss': 2.9776262817904356e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.25023144483566284, 'train/uncertainty_loss': 0.007933251559734344, 'train/video_loss': 0.26082170009613037, 'train/total_loss': 0.2608514726161957} -tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0990, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4023, 'grad_norm': 20.07342529296875, 'learning_rate': 1.1577513520745012e-05} -[Rank 3] Trainer log: {'loss': 0.4023, 'grad_norm': 20.07342529296875, 'learning_rate': 1.1577513520745012e-05} -[Rank 0] Trainer log: {'loss': 0.4023, 'grad_norm': 20.07342529296875, 'learning_rate': 1.1577513520745012e-05}[Rank 2] Trainer log: {'loss': 0.4023, 'grad_norm': 20.07342529296875, 'learning_rate': 1.1577513520745012e-05} - -{'loss': 0.4023, 'grad_norm': 20.07342529296875, 'learning_rate': 1.1577513520745012e-05, 'epoch': 0.48} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029193577356636526, 'train/lm_loss': 2.6200484717264773e-05, 'train/info_loss': 2.4735316401347518e-05, 'train/ref_loss': 0.1591043770313263, 'train/uncertainty_loss': -6.784069119021296e-05, 'train/video_loss': 0.16139675676822662, 'train/total_loss': 0.1614229530096054} -tensor(0.0978, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.39217543601989746, 'train/info_loss': 0.1837902069091797, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012432648800313474, 'train/video_loss': 0.18366588652133942, 'train/total_loss': 0.5758413076400757} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4102, 'grad_norm': 3.1366443634033203, 'learning_rate': 1.156697864774148e-05}[Rank 1] Trainer log: {'loss': 0.4102, 'grad_norm': 3.1366443634033203, 'learning_rate': 1.156697864774148e-05} -[Rank 3] Trainer log: {'loss': 0.4102, 'grad_norm': 3.1366443634033203, 'learning_rate': 1.156697864774148e-05} - -[Rank 0] Trainer log: {'loss': 0.4102, 'grad_norm': 3.1366443634033203, 'learning_rate': 1.156697864774148e-05} -{'loss': 0.4102, 'grad_norm': 3.1366443634033203, 'learning_rate': 1.156697864774148e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34354329109191895, 'train/info_loss': 0.2151029109954834, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011814015451818705, 'train/video_loss': 0.21498477458953857, 'train/total_loss': 0.5585280656814575} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32749288082122807, 'train/info_loss': 0.17974846065044403, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001125344540923834, 'train/video_loss': 0.1796359270811081, 'train/total_loss': 0.5071288347244263} -tensor(0.0630, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4081, 'grad_norm': 5.863702297210693, 'learning_rate': 1.1556441991570267e-05}[Rank 2] Trainer log: {'loss': 0.4081, 'grad_norm': 5.863702297210693, 'learning_rate': 1.1556441991570267e-05}[Rank 3] Trainer log: {'loss': 0.4081, 'grad_norm': 5.863702297210693, 'learning_rate': 1.1556441991570267e-05} - - -[Rank 1] Trainer log: {'loss': 0.4081, 'grad_norm': 5.863702297210693, 'learning_rate': 1.1556441991570267e-05} -{'loss': 0.4081, 'grad_norm': 5.863702297210693, 'learning_rate': 1.1556441991570267e-05, 'epoch': 0.48} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31552968025207523, 'train/info_loss': 0.20284323394298553, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001453855074942112, 'train/video_loss': 0.20269784331321716, 'train/total_loss': 0.5182275176048279} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0531, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4155867099761963, 'train/info_loss': 0.24303777515888214, 'train/ref_loss': None, 'train/uncertainty_loss': -9.933719411492348e-05, 'train/video_loss': 0.24293844401836395, 'train/total_loss': 0.6585251688957214} -tensor(-0.0006, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4053, 'grad_norm': 1.8834244012832642, 'learning_rate': 1.1545903564221723e-05}[Rank 3] Trainer log: {'loss': 0.4053, 'grad_norm': 1.8834244012832642, 'learning_rate': 1.1545903564221723e-05}[Rank 2] Trainer log: {'loss': 0.4053, 'grad_norm': 1.8834244012832642, 'learning_rate': 1.1545903564221723e-05} - - -[Rank 0] Trainer log: {'loss': 0.4053, 'grad_norm': 1.8834244012832642, 'learning_rate': 1.1545903564221723e-05} -{'loss': 0.4053, 'grad_norm': 1.8834244012832642, 'learning_rate': 1.1545903564221723e-05, 'epoch': 0.48} -tensor(0.1108, device='cuda:1', grad_fn=) tensor(0.0724, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2193, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002868340350687504, 'train/lm_loss': 3.3542679739184676e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.17103061079978943, 'train/uncertainty_loss': -6.696482887491584e-05, 'train/video_loss': 0.173283651471138, 'train/total_loss': 0.17331719398498535} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00047799111343920235, 'train/lm_loss': 4.379269958008081e-05, 'train/info_loss': 3.0516646802425385e-05, 'train/ref_loss': 0.18082642555236816, 'train/uncertainty_loss': -7.288953638635576e-05, 'train/video_loss': 0.18460798263549805, 'train/total_loss': 0.18465177714824677} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2854, 'grad_norm': 3.887515068054199, 'learning_rate': 1.1535363377688214e-05}[Rank 3] Trainer log: {'loss': 0.2854, 'grad_norm': 3.887515068054199, 'learning_rate': 1.1535363377688214e-05} - -[Rank 0] Trainer log: {'loss': 0.2854, 'grad_norm': 3.887515068054199, 'learning_rate': 1.1535363377688214e-05}[Rank 1] Trainer log: {'loss': 0.2854, 'grad_norm': 3.887515068054199, 'learning_rate': 1.1535363377688214e-05} - -{'loss': 0.2854, 'grad_norm': 3.887515068054199, 'learning_rate': 1.1535363377688214e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21950595378875734, 'train/info_loss': 0.1590735763311386, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011817982885986567, 'train/video_loss': 0.1589553952217102, 'train/total_loss': 0.3784613609313965} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3236, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2607, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00034905637148767714, 'train/lm_loss': 2.6272001559846106e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.17344903945922852, 'train/uncertainty_loss': -6.790102343074977e-05, 'train/video_loss': 0.17619891464710236, 'train/total_loss': 0.1762251853942871} -[Rank 2] Trainer log: {'loss': 0.3758, 'grad_norm': 2.249119281768799, 'learning_rate': 1.152482144396411e-05} -[Rank 1] Trainer log: {'loss': 0.3758, 'grad_norm': 2.249119281768799, 'learning_rate': 1.152482144396411e-05} -[Rank 0] Trainer log: {'loss': 0.3758, 'grad_norm': 2.249119281768799, 'learning_rate': 1.152482144396411e-05}[Rank 3] Trainer log: {'loss': 0.3758, 'grad_norm': 2.249119281768799, 'learning_rate': 1.152482144396411e-05} - -{'loss': 0.3758, 'grad_norm': 2.249119281768799, 'learning_rate': 1.152482144396411e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0596, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003335307352244854, 'train/lm_loss': 4.381653561722488e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.2383045256137848, 'train/uncertainty_loss': 0.0059603434056043625, 'train/video_loss': 0.2469625025987625, 'train/total_loss': 0.24700631201267242} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3009353637695313, 'train/info_loss': 0.19532939791679382, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015482320450246336, 'train/video_loss': 0.19517457485198975, 'train/total_loss': 0.4961099326610565} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1280, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.366, 'grad_norm': 4.677962303161621, 'learning_rate': 1.1514277775045768e-05} -[Rank 3] Trainer log: {'loss': 0.366, 'grad_norm': 4.677962303161621, 'learning_rate': 1.1514277775045768e-05} -[Rank 1] Trainer log: {'loss': 0.366, 'grad_norm': 4.677962303161621, 'learning_rate': 1.1514277775045768e-05} -[Rank 0] Trainer log: {'loss': 0.366, 'grad_norm': 4.677962303161621, 'learning_rate': 1.1514277775045768e-05} -{'loss': 0.366, 'grad_norm': 4.677962303161621, 'learning_rate': 1.1514277775045768e-05, 'epoch': 0.48} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13925817012786865, 'train/info_loss': 0.1463087797164917, 'train/ref_loss': None, 'train/uncertainty_loss': -9.900745935738088e-05, 'train/video_loss': 0.14620977640151978, 'train/total_loss': 0.2854679524898529} -tensor(0.0666, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0017, device='cuda:1', grad_fn=) tensor(-0.0017, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1456, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0007887547835707664, 'train/lm_loss': 2.586674236226827e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.27015551924705505, 'train/uncertainty_loss': 0.014557139575481416, 'train/video_loss': 0.29104578495025635, 'train/total_loss': 0.29107165336608887} -[Rank 1] Trainer log: {'loss': 0.4086, 'grad_norm': 3.9972825050354004, 'learning_rate': 1.150373238293152e-05} -[Rank 3] Trainer log: {'loss': 0.4086, 'grad_norm': 3.9972825050354004, 'learning_rate': 1.150373238293152e-05}[Rank 2] Trainer log: {'loss': 0.4086, 'grad_norm': 3.9972825050354004, 'learning_rate': 1.150373238293152e-05} - -[Rank 0] Trainer log: {'loss': 0.4086, 'grad_norm': 3.9972825050354004, 'learning_rate': 1.150373238293152e-05} -{'loss': 0.4086, 'grad_norm': 3.9972825050354004, 'learning_rate': 1.150373238293152e-05, 'epoch': 0.48} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.029422298073768616, 'train/info_loss': 0.14497512578964233, 'train/ref_loss': None, 'train/uncertainty_loss': -9.251756710000337e-05, 'train/video_loss': 0.14488260447978973, 'train/total_loss': 0.17430490255355835} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2284, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1359, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4342, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002805976895615459, 'train/lm_loss': 2.2982231166679414e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.5218969583511353, 'train/uncertainty_loss': 0.043418785929679876, 'train/video_loss': 0.5675850510597229, 'train/total_loss': 0.5676080584526062} -[Rank 2] Trainer log: {'loss': 0.3903, 'grad_norm': 16.572153091430664, 'learning_rate': 1.1493185279621655e-05} -[Rank 3] Trainer log: {'loss': 0.3903, 'grad_norm': 16.572153091430664, 'learning_rate': 1.1493185279621655e-05} -[Rank 1] Trainer log: {'loss': 0.3903, 'grad_norm': 16.572153091430664, 'learning_rate': 1.1493185279621655e-05} -[Rank 0] Trainer log: {'loss': 0.3903, 'grad_norm': 16.572153091430664, 'learning_rate': 1.1493185279621655e-05} -{'loss': 0.3903, 'grad_norm': 16.572153091430664, 'learning_rate': 1.1493185279621655e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21486825942993165, 'train/info_loss': 0.18772107362747192, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012470285873860122, 'train/video_loss': 0.1875963658094406, 'train/total_loss': 0.4024646282196045} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1497, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36878979206085205, 'train/info_loss': 0.09941631555557251, 'train/ref_loss': None, 'train/uncertainty_loss': -9.180598426610232e-05, 'train/video_loss': 0.09932450950145721, 'train/total_loss': 0.46811431646347046} -tensor(0.1046, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3953, 'grad_norm': 3.5227253437042236, 'learning_rate': 1.148263647711842e-05}[Rank 3] Trainer log: {'loss': 0.3953, 'grad_norm': 3.5227253437042236, 'learning_rate': 1.148263647711842e-05}[Rank 2] Trainer log: {'loss': 0.3953, 'grad_norm': 3.5227253437042236, 'learning_rate': 1.148263647711842e-05} - - -[Rank 0] Trainer log: {'loss': 0.3953, 'grad_norm': 3.5227253437042236, 'learning_rate': 1.148263647711842e-05} -{'loss': 0.3953, 'grad_norm': 3.5227253437042236, 'learning_rate': 1.148263647711842e-05, 'epoch': 0.48} -tensor(0.3045, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08835054636001588, 'train/info_loss': 0.2184736579656601, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010269763879477978, 'train/video_loss': 0.2183709591627121, 'train/total_loss': 0.3067215085029602} -tensor(0.2612, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002421858021989465, 'train/lm_loss': 2.6009776047430935e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.20126262307167053, 'train/uncertainty_loss': -7.234408985823393e-05, 'train/video_loss': 0.2031519114971161, 'train/total_loss': 0.20317791402339935} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0773, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3409, 'grad_norm': 12.162702560424805, 'learning_rate': 1.1472085987425982e-05}[Rank 0] Trainer log: {'loss': 0.3409, 'grad_norm': 12.162702560424805, 'learning_rate': 1.1472085987425982e-05} -[Rank 3] Trainer log: {'loss': 0.3409, 'grad_norm': 12.162702560424805, 'learning_rate': 1.1472085987425982e-05} - -{'loss': 0.3409, 'grad_norm': 12.162702560424805, 'learning_rate': 1.1472085987425982e-05, 'epoch': 0.48} -[Rank 2] Trainer log: {'loss': 0.3409, 'grad_norm': 12.162702560424805, 'learning_rate': 1.1472085987425982e-05} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22598869800567628, 'train/info_loss': 0.17974890768527985, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010620104148983955, 'train/video_loss': 0.1796427071094513, 'train/total_loss': 0.405631422996521} -tensor(0.0575, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0407, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0635, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0186, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003656445071101189, 'train/lm_loss': 3.8286359631456436e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.16691547632217407, 'train/uncertainty_loss': 0.001859494671225548, 'train/video_loss': 0.1717308908700943, 'train/total_loss': 0.1717691719532013} -[Rank 0] Trainer log: {'loss': 0.236, 'grad_norm': 2.0814571380615234, 'learning_rate': 1.1461533822550443e-05}[Rank 3] Trainer log: {'loss': 0.236, 'grad_norm': 2.0814571380615234, 'learning_rate': 1.1461533822550443e-05} -[Rank 2] Trainer log: {'loss': 0.236, 'grad_norm': 2.0814571380615234, 'learning_rate': 1.1461533822550443e-05}[Rank 1] Trainer log: {'loss': 0.236, 'grad_norm': 2.0814571380615234, 'learning_rate': 1.1461533822550443e-05} - - -{'loss': 0.236, 'grad_norm': 2.0814571380615234, 'learning_rate': 1.1461533822550443e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34652979373931886, 'train/info_loss': 0.17144185304641724, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011947535676881671, 'train/video_loss': 0.17132237553596497, 'train/total_loss': 0.5178521871566772} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11489582061767578, 'train/info_loss': 0.29576605558395386, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013547997223213316, 'train/video_loss': 0.2956305742263794, 'train/total_loss': 0.4105263948440552} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1879, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3876, 'grad_norm': 5.347225189208984, 'learning_rate': 1.1450979994499793e-05} -[Rank 1] Trainer log: {'loss': 0.3876, 'grad_norm': 5.347225189208984, 'learning_rate': 1.1450979994499793e-05}[Rank 0] Trainer log: {'loss': 0.3876, 'grad_norm': 5.347225189208984, 'learning_rate': 1.1450979994499793e-05} - -[Rank 2] Trainer log: {'loss': 0.3876, 'grad_norm': 5.347225189208984, 'learning_rate': 1.1450979994499793e-05} -{'loss': 0.3876, 'grad_norm': 5.347225189208984, 'learning_rate': 1.1450979994499793e-05, 'epoch': 0.48} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1307, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0343, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025498883333057164, 'train/lm_loss': 6.381447310559452e-05, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.23618805408477783, 'train/uncertainty_loss': 0.0034284021705389025, 'train/video_loss': 0.24168837070465088, 'train/total_loss': 0.24175219237804413} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3081267356872559, 'train/info_loss': 0.15164177119731903, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010134673211723567, 'train/video_loss': 0.15154042840003967, 'train/total_loss': 0.4596671760082245} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4001, 'grad_norm': 7.071802616119385, 'learning_rate': 1.144042451528393e-05} -[Rank 1] Trainer log: {'loss': 0.4001, 'grad_norm': 7.071802616119385, 'learning_rate': 1.144042451528393e-05} -[Rank 0] Trainer log: {'loss': 0.4001, 'grad_norm': 7.071802616119385, 'learning_rate': 1.144042451528393e-05}[Rank 3] Trainer log: {'loss': 0.4001, 'grad_norm': 7.071802616119385, 'learning_rate': 1.144042451528393e-05} - -{'loss': 0.4001, 'grad_norm': 7.071802616119385, 'learning_rate': 1.144042451528393e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13863737583160402, 'train/info_loss': 0.19588249921798706, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001042818301357329, 'train/video_loss': 0.19577822089195251, 'train/total_loss': 0.33441561460494995} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.4473, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021519241854548455, 'train/lm_loss': 3.866775659844279e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.12485770881175995, 'train/uncertainty_loss': -7.074041059240699e-05, 'train/video_loss': 0.1265365332365036, 'train/total_loss': 0.12657520174980164} -tensor(0.0108, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.358, 'grad_norm': 2.607140064239502, 'learning_rate': 1.1429867396914625e-05}[Rank 1] Trainer log: {'loss': 0.358, 'grad_norm': 2.607140064239502, 'learning_rate': 1.1429867396914625e-05}[Rank 3] Trainer log: {'loss': 0.358, 'grad_norm': 2.607140064239502, 'learning_rate': 1.1429867396914625e-05} - - -[Rank 2] Trainer log: {'loss': 0.358, 'grad_norm': 2.607140064239502, 'learning_rate': 1.1429867396914625e-05} -{'loss': 0.358, 'grad_norm': 2.607140064239502, 'learning_rate': 1.1429867396914625e-05, 'epoch': 0.48} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15639345645904543, 'train/info_loss': 0.20618540048599243, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010616747895255686, 'train/video_loss': 0.20607922971248627, 'train/total_loss': 0.36247268319129944} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.364274263381958, 'train/info_loss': 0.171014204621315, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011962667340412736, 'train/video_loss': 0.1708945780992508, 'train/total_loss': 0.5351688265800476} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4582, 'grad_norm': 3.056138038635254, 'learning_rate': 1.1419308651405512e-05}[Rank 3] Trainer log: {'loss': 0.4582, 'grad_norm': 3.056138038635254, 'learning_rate': 1.1419308651405512e-05} -[Rank 0] Trainer log: {'loss': 0.4582, 'grad_norm': 3.056138038635254, 'learning_rate': 1.1419308651405512e-05} - -[Rank 1] Trainer log: {'loss': 0.4582, 'grad_norm': 3.056138038635254, 'learning_rate': 1.1419308651405512e-05} -{'loss': 0.4582, 'grad_norm': 3.056138038635254, 'learning_rate': 1.1419308651405512e-05, 'epoch': 0.48} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.3083, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000261351652443409, 'train/lm_loss': 4.426942905411124e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.4241097867488861, 'train/uncertainty_loss': 0.03083259165287018, 'train/video_loss': 0.4570644795894623, 'train/total_loss': 0.457108736038208} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005063887685537339, 'train/lm_loss': 2.291071286890656e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.1946549415588379, 'train/uncertainty_loss': -7.350415689870715e-05, 'train/video_loss': 0.19865596294403076, 'train/total_loss': 0.1986788809299469} -[Rank 3] Trainer log: {'loss': 0.3221, 'grad_norm': 6.9496259689331055, 'learning_rate': 1.140874829077208e-05}[Rank 2] Trainer log: {'loss': 0.3221, 'grad_norm': 6.9496259689331055, 'learning_rate': 1.140874829077208e-05} - -[Rank 0] Trainer log: {'loss': 0.3221, 'grad_norm': 6.9496259689331055, 'learning_rate': 1.140874829077208e-05}[Rank 1] Trainer log: {'loss': 0.3221, 'grad_norm': 6.9496259689331055, 'learning_rate': 1.140874829077208e-05} - -{'loss': 0.3221, 'grad_norm': 6.9496259689331055, 'learning_rate': 1.140874829077208e-05, 'epoch': 0.48} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023361074272543192, 'train/lm_loss': 1.7594516975805163e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.2013835310935974, 'train/uncertainty_loss': -6.903389585204423e-05, 'train/video_loss': 0.20320436358451843, 'train/total_loss': 0.20322196185588837} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.9561, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0672, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2284, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033538169227540496, 'train/lm_loss': 4.3315964285284286e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.3532584607601166, 'train/uncertainty_loss': 0.02283802628517151, 'train/video_loss': 0.3788098096847534, 'train/total_loss': 0.37885311245918274} -[Rank 2] Trainer log: {'loss': 0.4027, 'grad_norm': 10.195490837097168, 'learning_rate': 1.1398186327031663e-05}[Rank 3] Trainer log: {'loss': 0.4027, 'grad_norm': 10.195490837097168, 'learning_rate': 1.1398186327031663e-05}[Rank 1] Trainer log: {'loss': 0.4027, 'grad_norm': 10.195490837097168, 'learning_rate': 1.1398186327031663e-05} - - -[Rank 0] Trainer log: {'loss': 0.4027, 'grad_norm': 10.195490837097168, 'learning_rate': 1.1398186327031663e-05} -{'loss': 0.4027, 'grad_norm': 10.195490837097168, 'learning_rate': 1.1398186327031663e-05, 'epoch': 0.48} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0500, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1829, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003664586925879121, 'train/lm_loss': 2.6057453942485155e-05, 'train/info_loss': 2.6344558136770502e-05, 'train/ref_loss': 0.20975685119628906, 'train/uncertainty_loss': -7.856964366510512e-05, 'train/video_loss': 0.2126362919807434, 'train/total_loss': 0.21266235411167145} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000389961781911552, 'train/lm_loss': 1.3780119479633869e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.1866665482521057, 'train/uncertainty_loss': -7.385513745248318e-05, 'train/video_loss': 0.18973152339458466, 'train/total_loss': 0.18974530696868896} -[Rank 1] Trainer log: {'loss': 0.3232, 'grad_norm': 4.0991902351379395, 'learning_rate': 1.13876227722034e-05}[Rank 2] Trainer log: {'loss': 0.3232, 'grad_norm': 4.0991902351379395, 'learning_rate': 1.13876227722034e-05} - -[Rank 3] Trainer log: {'loss': 0.3232, 'grad_norm': 4.0991902351379395, 'learning_rate': 1.13876227722034e-05} -[Rank 0] Trainer log: {'loss': 0.3232, 'grad_norm': 4.0991902351379395, 'learning_rate': 1.13876227722034e-05} -{'loss': 0.3232, 'grad_norm': 4.0991902351379395, 'learning_rate': 1.13876227722034e-05, 'epoch': 0.48} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25097203254699707, 'train/info_loss': 0.18076470494270325, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011653725523501635, 'train/video_loss': 0.18064816296100616, 'train/total_loss': 0.43162018060684204} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.39866974353790285, 'train/info_loss': 0.15563862025737762, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012871127109974624, 'train/video_loss': 0.15550990402698517, 'train/total_loss': 0.5541796684265137} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.5364, 'grad_norm': 3.137474536895752, 'learning_rate': 1.137705763830826e-05}[Rank 1] Trainer log: {'loss': 0.5364, 'grad_norm': 3.137474536895752, 'learning_rate': 1.137705763830826e-05} -[Rank 2] Trainer log: {'loss': 0.5364, 'grad_norm': 3.137474536895752, 'learning_rate': 1.137705763830826e-05} -[Rank 3] Trainer log: {'loss': 0.5364, 'grad_norm': 3.137474536895752, 'learning_rate': 1.137705763830826e-05} - -{'loss': 0.5364, 'grad_norm': 3.137474536895752, 'learning_rate': 1.137705763830826e-05, 'epoch': 0.48} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1828397512435913, 'train/info_loss': 0.16716523468494415, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011200365843251349, 'train/video_loss': 0.1670532375574112, 'train/total_loss': 0.3498929738998413} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20830950736999512, 'train/info_loss': 0.2270187884569168, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000111199333332479, 'train/video_loss': 0.22690759599208832, 'train/total_loss': 0.43521711230278015} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4223, 'grad_norm': 5.384871959686279, 'learning_rate': 1.1366490937368997e-05}[Rank 1] Trainer log: {'loss': 0.4223, 'grad_norm': 5.384871959686279, 'learning_rate': 1.1366490937368997e-05} -[Rank 2] Trainer log: {'loss': 0.4223, 'grad_norm': 5.384871959686279, 'learning_rate': 1.1366490937368997e-05} - -[Rank 0] Trainer log: {'loss': 0.4223, 'grad_norm': 5.384871959686279, 'learning_rate': 1.1366490937368997e-05} -{'loss': 0.4223, 'grad_norm': 5.384871959686279, 'learning_rate': 1.1366490937368997e-05, 'epoch': 0.48} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38062727451324463, 'train/info_loss': 0.17056579887866974, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011994711821898818, 'train/video_loss': 0.17044584453105927, 'train/total_loss': 0.5510731339454651} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.1688, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1927, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022032062988728285, 'train/lm_loss': 2.3053748009260744e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.3389272093772888, 'train/uncertainty_loss': 0.019273951649665833, 'train/video_loss': 0.35998785495758057, 'train/total_loss': 0.36001092195510864} -[Rank 1] Trainer log: {'loss': 0.4585, 'grad_norm': 4.292842388153076, 'learning_rate': 1.1355922681410155e-05} -[Rank 3] Trainer log: {'loss': 0.4585, 'grad_norm': 4.292842388153076, 'learning_rate': 1.1355922681410155e-05}[Rank 0] Trainer log: {'loss': 0.4585, 'grad_norm': 4.292842388153076, 'learning_rate': 1.1355922681410155e-05} - -[Rank 2] Trainer log: {'loss': 0.4585, 'grad_norm': 4.292842388153076, 'learning_rate': 1.1355922681410155e-05} -{'loss': 0.4585, 'grad_norm': 4.292842388153076, 'learning_rate': 1.1355922681410155e-05, 'epoch': 0.48} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0044, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027214130386710167, 'train/lm_loss': 2.0240712910890582e-05, 'train/info_loss': 2.151681292161811e-05, 'train/ref_loss': 0.2261197417974472, 'train/uncertainty_loss': 0.00043819379061460495, 'train/video_loss': 0.2287565916776657, 'train/total_loss': 0.22877682745456696} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38980643749237065, 'train/info_loss': 0.18330545723438263, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013042142381891608, 'train/video_loss': 0.18317504227161407, 'train/total_loss': 0.5729814767837524} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4569, 'grad_norm': 2.4226372241973877, 'learning_rate': 1.1345352882458042e-05}[Rank 0] Trainer log: {'loss': 0.4569, 'grad_norm': 2.4226372241973877, 'learning_rate': 1.1345352882458042e-05}[Rank 3] Trainer log: {'loss': 0.4569, 'grad_norm': 2.4226372241973877, 'learning_rate': 1.1345352882458042e-05} - - -{'loss': 0.4569, 'grad_norm': 2.4226372241973877, 'learning_rate': 1.1345352882458042e-05, 'epoch': 0.48} -[Rank 2] Trainer log: {'loss': 0.4569, 'grad_norm': 2.4226372241973877, 'learning_rate': 1.1345352882458042e-05} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14874373674392702, 'train/info_loss': 0.13062874972820282, 'train/ref_loss': None, 'train/uncertainty_loss': -9.165643132291734e-05, 'train/video_loss': 0.13053709268569946, 'train/total_loss': 0.2792808413505554} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1708, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027943875174969436, 'train/lm_loss': 1.7642196326050907e-05, 'train/info_loss': 2.151681292161811e-05, 'train/ref_loss': 0.3260694146156311, 'train/uncertainty_loss': 0.017083685100078582, 'train/video_loss': 0.3454101085662842, 'train/total_loss': 0.3454277515411377} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2294, 'grad_norm': 3.414252996444702, 'learning_rate': 1.1334781552540732e-05} -[Rank 1] Trainer log: {'loss': 0.2294, 'grad_norm': 3.414252996444702, 'learning_rate': 1.1334781552540732e-05} -[Rank 2] Trainer log: {'loss': 0.2294, 'grad_norm': 3.414252996444702, 'learning_rate': 1.1334781552540732e-05} -[Rank 0] Trainer log: {'loss': 0.2294, 'grad_norm': 3.414252996444702, 'learning_rate': 1.1334781552540732e-05} -{'loss': 0.2294, 'grad_norm': 3.414252996444702, 'learning_rate': 1.1334781552540732e-05, 'epoch': 0.48} -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03043414950370789, 'train/info_loss': 0.20030072331428528, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00016457608435302973, 'train/video_loss': 0.20013613998889923, 'train/total_loss': 0.23057028651237488} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.5089, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42087783813476565, 'train/info_loss': 0.16732169687747955, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012646815739572048, 'train/video_loss': 0.16719523072242737, 'train/total_loss': 0.5880730748176575} -tensor(0.0657, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4865, 'grad_norm': 12.344096183776855, 'learning_rate': 1.1324208703688026e-05} -[Rank 0] Trainer log: {'loss': 0.4865, 'grad_norm': 12.344096183776855, 'learning_rate': 1.1324208703688026e-05}[Rank 1] Trainer log: {'loss': 0.4865, 'grad_norm': 12.344096183776855, 'learning_rate': 1.1324208703688026e-05} - -[Rank 3] Trainer log: {'loss': 0.4865, 'grad_norm': 12.344096183776855, 'learning_rate': 1.1324208703688026e-05} -{'loss': 0.4865, 'grad_norm': 12.344096183776855, 'learning_rate': 1.1324208703688026e-05, 'epoch': 0.48} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002386088250204921, 'train/lm_loss': 1.5591966803185642e-05, 'train/info_loss': 2.181482341256924e-05, 'train/ref_loss': 0.17134463787078857, 'train/uncertainty_loss': -7.121426751837135e-05, 'train/video_loss': 0.17320410907268524, 'train/total_loss': 0.173219695687294} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2770405292510986, 'train/info_loss': 0.18740279972553253, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013283080188557507, 'train/video_loss': 0.18726997077465057, 'train/total_loss': 0.46431052684783936} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.302, 'grad_norm': 3.624013662338257, 'learning_rate': 1.1313634347931466e-05}[Rank 3] Trainer log: {'loss': 0.302, 'grad_norm': 3.624013662338257, 'learning_rate': 1.1313634347931466e-05} -[Rank 1] Trainer log: {'loss': 0.302, 'grad_norm': 3.624013662338257, 'learning_rate': 1.1313634347931466e-05} - -[Rank 0] Trainer log: {'loss': 0.302, 'grad_norm': 3.624013662338257, 'learning_rate': 1.1313634347931466e-05} -{'loss': 0.302, 'grad_norm': 3.624013662338257, 'learning_rate': 1.1313634347931466e-05, 'epoch': 0.49} -tensor(0.0758, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1889, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5533, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000595219898968935, 'train/lm_loss': 2.0002319070044907e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.20839840173721313, 'train/uncertainty_loss': -7.226623129099608e-05, 'train/video_loss': 0.21311244368553162, 'train/total_loss': 0.21313244104385376} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05958203077316285, 'train/info_loss': 0.13405141234397888, 'train/ref_loss': None, 'train/uncertainty_loss': -9.363995632156731e-05, 'train/video_loss': 0.13395777344703674, 'train/total_loss': 0.1935397982597351} -tensor(0.7290, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3779, 'grad_norm': 4.723217010498047, 'learning_rate': 1.1303058497304304e-05}[Rank 3] Trainer log: {'loss': 0.3779, 'grad_norm': 4.723217010498047, 'learning_rate': 1.1303058497304304e-05} - -[Rank 0] Trainer log: {'loss': 0.3779, 'grad_norm': 4.723217010498047, 'learning_rate': 1.1303058497304304e-05}[Rank 2] Trainer log: {'loss': 0.3779, 'grad_norm': 4.723217010498047, 'learning_rate': 1.1303058497304304e-05} - -{'loss': 0.3779, 'grad_norm': 4.723217010498047, 'learning_rate': 1.1303058497304304e-05, 'epoch': 0.49} -tensor(-0.0017, device='cuda:2', grad_fn=) tensor(-0.0017, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27073748111724855, 'train/info_loss': 0.2879105508327484, 'train/ref_loss': None, 'train/uncertainty_loss': -9.275845368392766e-05, 'train/video_loss': 0.2878178060054779, 'train/total_loss': 0.5585553050041199} -tensor(0.0851, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4170695781707764, 'train/info_loss': 0.19697602093219757, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011171387741342187, 'train/video_loss': 0.1968643069267273, 'train/total_loss': 0.6139339208602905} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0088, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3872, 'grad_norm': 5.631134033203125, 'learning_rate': 1.1292481163841489e-05}[Rank 2] Trainer log: {'loss': 0.3872, 'grad_norm': 5.631134033203125, 'learning_rate': 1.1292481163841489e-05} - -[Rank 3] Trainer log: {'loss': 0.3872, 'grad_norm': 5.631134033203125, 'learning_rate': 1.1292481163841489e-05} -[Rank 0] Trainer log: {'loss': 0.3872, 'grad_norm': 5.631134033203125, 'learning_rate': 1.1292481163841489e-05} -{'loss': 0.3872, 'grad_norm': 5.631134033203125, 'learning_rate': 1.1292481163841489e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3884771823883057, 'train/info_loss': 0.2364257425069809, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012035117251798511, 'train/video_loss': 0.2363053858280182, 'train/total_loss': 0.6247825622558594} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1244, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002574573736637831, 'train/lm_loss': 2.293455181643367e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.29373469948768616, 'train/uncertainty_loss': 0.01244252398610115, 'train/video_loss': 0.3082592487335205, 'train/total_loss': 0.30828219652175903} -[Rank 3] Trainer log: {'loss': 0.4563, 'grad_norm': 6.703609466552734, 'learning_rate': 1.1281902359579669e-05}[Rank 1] Trainer log: {'loss': 0.4563, 'grad_norm': 6.703609466552734, 'learning_rate': 1.1281902359579669e-05}[Rank 2] Trainer log: {'loss': 0.4563, 'grad_norm': 6.703609466552734, 'learning_rate': 1.1281902359579669e-05} - - -[Rank 0] Trainer log: {'loss': 0.4563, 'grad_norm': 6.703609466552734, 'learning_rate': 1.1281902359579669e-05} -{'loss': 0.4563, 'grad_norm': 6.703609466552734, 'learning_rate': 1.1281902359579669e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029738217126578097, 'train/lm_loss': 3.3638032618910074e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.2167385369539261, 'train/uncertainty_loss': -7.124600815586746e-05, 'train/video_loss': 0.2190721035003662, 'train/total_loss': 0.21910573542118073} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3391152858734131, 'train/info_loss': 0.16434146463871002, 'train/ref_loss': None, 'train/uncertainty_loss': -9.247775305993856e-05, 'train/video_loss': 0.164248988032341, 'train/total_loss': 0.5033642649650574} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3279, 'grad_norm': 1.8644397258758545, 'learning_rate': 1.1271322096557152e-05}[Rank 1] Trainer log: {'loss': 0.3279, 'grad_norm': 1.8644397258758545, 'learning_rate': 1.1271322096557152e-05}[Rank 2] Trainer log: {'loss': 0.3279, 'grad_norm': 1.8644397258758545, 'learning_rate': 1.1271322096557152e-05} - -[Rank 3] Trainer log: {'loss': 0.3279, 'grad_norm': 1.8644397258758545, 'learning_rate': 1.1271322096557152e-05} - -{'loss': 0.3279, 'grad_norm': 1.8644397258758545, 'learning_rate': 1.1271322096557152e-05, 'epoch': 0.49} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0735, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002547347452491522, 'train/lm_loss': 2.0002319070044907e-05, 'train/info_loss': 2.3602882720297202e-05, 'train/ref_loss': 0.26353341341018677, 'train/uncertainty_loss': 0.007345779985189438, 'train/video_loss': 0.27294066548347473, 'train/total_loss': 0.2729606628417969} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0711, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0628, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033419085666537286, 'train/lm_loss': 1.766603672876954e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2579119801521301, 'train/uncertainty_loss': 0.006281530857086182, 'train/video_loss': 0.26688772439956665, 'train/total_loss': 0.26690539717674255} -[Rank 0] Trainer log: {'loss': 0.3125, 'grad_norm': 2.7475154399871826, 'learning_rate': 1.1260740386813916e-05}[Rank 1] Trainer log: {'loss': 0.3125, 'grad_norm': 2.7475154399871826, 'learning_rate': 1.1260740386813916e-05} -[Rank 3] Trainer log: {'loss': 0.3125, 'grad_norm': 2.7475154399871826, 'learning_rate': 1.1260740386813916e-05} -[Rank 2] Trainer log: {'loss': 0.3125, 'grad_norm': 2.7475154399871826, 'learning_rate': 1.1260740386813916e-05} - -{'loss': 0.3125, 'grad_norm': 2.7475154399871826, 'learning_rate': 1.1260740386813916e-05, 'epoch': 0.49} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0630, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000273836636915803, 'train/lm_loss': 4.412640992086381e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.14764055609703064, 'train/uncertainty_loss': -7.128661964088679e-05, 'train/video_loss': 0.14979024231433868, 'train/total_loss': 0.14983436465263367} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003589187515899539, 'train/lm_loss': 2.0193033560644838e-05, 'train/info_loss': 2.2172436729306355e-05, 'train/ref_loss': 0.1349163055419922, 'train/uncertainty_loss': -7.1068195393309e-05, 'train/video_loss': 0.13773876428604126, 'train/total_loss': 0.13775895535945892} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3505, 'grad_norm': 2.6598494052886963, 'learning_rate': 1.125015724239158e-05}[Rank 0] Trainer log: {'loss': 0.3505, 'grad_norm': 2.6598494052886963, 'learning_rate': 1.125015724239158e-05}[Rank 1] Trainer log: {'loss': 0.3505, 'grad_norm': 2.6598494052886963, 'learning_rate': 1.125015724239158e-05} - - -[Rank 3] Trainer log: {'loss': 0.3505, 'grad_norm': 2.6598494052886963, 'learning_rate': 1.125015724239158e-05} -{'loss': 0.3505, 'grad_norm': 2.6598494052886963, 'learning_rate': 1.125015724239158e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11183707714080811, 'train/info_loss': 0.2141566127538681, 'train/ref_loss': None, 'train/uncertainty_loss': -9.5774931833148e-05, 'train/video_loss': 0.21406084299087524, 'train/total_loss': 0.3258979320526123} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2122830390930176, 'train/info_loss': 0.21234819293022156, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001238299417309463, 'train/video_loss': 0.21222436428070068, 'train/total_loss': 0.42450740933418274} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3826, 'grad_norm': 2.313490152359009, 'learning_rate': 1.12395726753334e-05}[Rank 3] Trainer log: {'loss': 0.3826, 'grad_norm': 2.313490152359009, 'learning_rate': 1.12395726753334e-05} - -[Rank 0] Trainer log: {'loss': 0.3826, 'grad_norm': 2.313490152359009, 'learning_rate': 1.12395726753334e-05}[Rank 2] Trainer log: {'loss': 0.3826, 'grad_norm': 2.313490152359009, 'learning_rate': 1.12395726753334e-05} - -{'loss': 0.3826, 'grad_norm': 2.313490152359009, 'learning_rate': 1.12395726753334e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16384886503219606, 'train/info_loss': 0.2228795439004898, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012061666930094362, 'train/video_loss': 0.2227589339017868, 'train/total_loss': 0.3866077959537506} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3260855197906494, 'train/info_loss': 0.2753165662288666, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014396698679775, 'train/video_loss': 0.2751725912094116, 'train/total_loss': 0.6012581586837769} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3768, 'grad_norm': 2.7997474670410156, 'learning_rate': 1.1228986697684248e-05}[Rank 2] Trainer log: {'loss': 0.3768, 'grad_norm': 2.7997474670410156, 'learning_rate': 1.1228986697684248e-05} -[Rank 3] Trainer log: {'loss': 0.3768, 'grad_norm': 2.7997474670410156, 'learning_rate': 1.1228986697684248e-05} - -[Rank 1] Trainer log: {'loss': 0.3768, 'grad_norm': 2.7997474670410156, 'learning_rate': 1.1228986697684248e-05} -{'loss': 0.3768, 'grad_norm': 2.7997474670410156, 'learning_rate': 1.1228986697684248e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27017664909362793, 'train/info_loss': 0.24103227257728577, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001237314660102129, 'train/video_loss': 0.24090854823589325, 'train/total_loss': 0.5110852122306824} -tensor(0.0292, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0820, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27412440776824953, 'train/info_loss': 0.1921161562204361, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001271778135560453, 'train/video_loss': 0.1919889748096466, 'train/total_loss': 0.4661133885383606} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4295, 'grad_norm': 2.199033737182617, 'learning_rate': 1.1218399321490601e-05}[Rank 3] Trainer log: {'loss': 0.4295, 'grad_norm': 2.199033737182617, 'learning_rate': 1.1218399321490601e-05}[Rank 1] Trainer log: {'loss': 0.4295, 'grad_norm': 2.199033737182617, 'learning_rate': 1.1218399321490601e-05} - - -{'loss': 0.4295, 'grad_norm': 2.199033737182617, 'learning_rate': 1.1218399321490601e-05, 'epoch': 0.49} -[Rank 2] Trainer log: {'loss': 0.4295, 'grad_norm': 2.199033737182617, 'learning_rate': 1.1218399321490601e-05} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.305824875831604, 'train/info_loss': 0.1582883894443512, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011954186484217644, 'train/video_loss': 0.15816885232925415, 'train/total_loss': 0.46399372816085815} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2189, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31245520114898684, 'train/info_loss': 0.17545798420906067, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011913577327504754, 'train/video_loss': 0.17533884942531586, 'train/total_loss': 0.48779404163360596} -tensor(0.0391, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.3578, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2906, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3787, 'grad_norm': 5.56175422668457, 'learning_rate': 1.1207810558800528e-05} -[Rank 0] Trainer log: {'loss': 0.3787, 'grad_norm': 5.56175422668457, 'learning_rate': 1.1207810558800528e-05}[Rank 3] Trainer log: {'loss': 0.3787, 'grad_norm': 5.56175422668457, 'learning_rate': 1.1207810558800528e-05}[Rank 2] Trainer log: {'loss': 0.3787, 'grad_norm': 5.56175422668457, 'learning_rate': 1.1207810558800528e-05} - - -{'loss': 0.3787, 'grad_norm': 5.56175422668457, 'learning_rate': 1.1207810558800528e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018368118908256294, 'train/lm_loss': 3.881077864207328e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.1940653920173645, 'train/uncertainty_loss': -7.217972888611257e-05, 'train/video_loss': 0.19549180567264557, 'train/total_loss': 0.19553062319755554} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31017541885375977, 'train/info_loss': 0.18183758854866028, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013174114283174277, 'train/video_loss': 0.18170584738254547, 'train/total_loss': 0.49188125133514404} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2985, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2633, 'grad_norm': 4.425489902496338, 'learning_rate': 1.1197220421663675e-05} -[Rank 0] Trainer log: {'loss': 0.2633, 'grad_norm': 4.425489902496338, 'learning_rate': 1.1197220421663675e-05}[Rank 3] Trainer log: {'loss': 0.2633, 'grad_norm': 4.425489902496338, 'learning_rate': 1.1197220421663675e-05} - -[Rank 2] Trainer log: {'loss': 0.2633, 'grad_norm': 4.425489902496338, 'learning_rate': 1.1197220421663675e-05} -{'loss': 0.2633, 'grad_norm': 4.425489902496338, 'learning_rate': 1.1197220421663675e-05, 'epoch': 0.49} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09294674992561341, 'train/info_loss': 0.1072859913110733, 'train/ref_loss': None, 'train/uncertainty_loss': -8.423227118328215e-05, 'train/video_loss': 0.10720176249742508, 'train/total_loss': 0.20014852285385132} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0085, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000572055671364069, 'train/lm_loss': 4.972793394699693e-05, 'train/info_loss': 3.278148142271675e-05, 'train/ref_loss': 0.21466954052448273, 'train/uncertainty_loss': 0.0008461283519864083, 'train/video_loss': 0.22012490034103394, 'train/total_loss': 0.2201746255159378} -[Rank 1] Trainer log: {'loss': 0.3719, 'grad_norm': 2.0170609951019287, 'learning_rate': 1.1186628922131251e-05} -[Rank 2] Trainer log: {'loss': 0.3719, 'grad_norm': 2.0170609951019287, 'learning_rate': 1.1186628922131251e-05} -[Rank 3] Trainer log: {'loss': 0.3719, 'grad_norm': 2.0170609951019287, 'learning_rate': 1.1186628922131251e-05} -[Rank 0] Trainer log: {'loss': 0.3719, 'grad_norm': 2.0170609951019287, 'learning_rate': 1.1186628922131251e-05} -{'loss': 0.3719, 'grad_norm': 2.0170609951019287, 'learning_rate': 1.1186628922131251e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2745287656784058, 'train/info_loss': 0.1906413584947586, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011542264837771654, 'train/video_loss': 0.19052593410015106, 'train/total_loss': 0.4650546908378601} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(1.1132, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1892929553985596, 'train/info_loss': 0.1912183165550232, 'train/ref_loss': None, 'train/uncertainty_loss': -9.300923557020724e-05, 'train/video_loss': 0.1911253035068512, 'train/total_loss': 0.38041824102401733} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4541, 'grad_norm': 9.38331413269043, 'learning_rate': 1.1176036072256025e-05}[Rank 1] Trainer log: {'loss': 0.4541, 'grad_norm': 9.38331413269043, 'learning_rate': 1.1176036072256025e-05} - -[Rank 0] Trainer log: {'loss': 0.4541, 'grad_norm': 9.38331413269043, 'learning_rate': 1.1176036072256025e-05}[Rank 2] Trainer log: {'loss': 0.4541, 'grad_norm': 9.38331413269043, 'learning_rate': 1.1176036072256025e-05} - -{'loss': 0.4541, 'grad_norm': 9.38331413269043, 'learning_rate': 1.1176036072256025e-05, 'epoch': 0.49} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032726717181503774, 'train/lm_loss': 3.342349082231522e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.2002326250076294, 'train/uncertainty_loss': -7.18548195436597e-05, 'train/video_loss': 0.20280501246452332, 'train/total_loss': 0.20283843576908112} -tensor(0.3632, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2470, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018528415821492673, 'train/lm_loss': 3.859624266624451e-05, 'train/info_loss': 2.8907417799928226e-05, 'train/ref_loss': 0.38178497552871704, 'train/uncertainty_loss': 0.024696581065654755, 'train/video_loss': 0.40799275040626526, 'train/total_loss': 0.4080313444137573} -[Rank 1] Trainer log: {'loss': 0.3961, 'grad_norm': 7.879559516906738, 'learning_rate': 1.1165441884092287e-05}[Rank 3] Trainer log: {'loss': 0.3961, 'grad_norm': 7.879559516906738, 'learning_rate': 1.1165441884092287e-05}[Rank 0] Trainer log: {'loss': 0.3961, 'grad_norm': 7.879559516906738, 'learning_rate': 1.1165441884092287e-05} - - -[Rank 2] Trainer log: {'loss': 0.3961, 'grad_norm': 7.879559516906738, 'learning_rate': 1.1165441884092287e-05} -{'loss': 0.3961, 'grad_norm': 7.879559516906738, 'learning_rate': 1.1165441884092287e-05, 'epoch': 0.49} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2317, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2563, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025523146614432336, 'train/lm_loss': 2.3006070114206524e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.384145587682724, 'train/uncertainty_loss': 0.025626733899116516, 'train/video_loss': 0.4118382930755615, 'train/total_loss': 0.4118613004684448} -tensor(0.1064, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0115, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3290, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002964100567623973, 'train/lm_loss': 5.699776229448617e-05, 'train/info_loss': 3.3556287235114723e-05, 'train/ref_loss': 0.4304743707180023, 'train/uncertainty_loss': 0.032896587252616884, 'train/video_loss': 0.4657757878303528, 'train/total_loss': 0.46583279967308044} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3458, 'grad_norm': 6.669648170471191, 'learning_rate': 1.1154846369695864e-05} -[Rank 2] Trainer log: {'loss': 0.3458, 'grad_norm': 6.669648170471191, 'learning_rate': 1.1154846369695864e-05} -[Rank 0] Trainer log: {'loss': 0.3458, 'grad_norm': 6.669648170471191, 'learning_rate': 1.1154846369695864e-05}[Rank 1] Trainer log: {'loss': 0.3458, 'grad_norm': 6.669648170471191, 'learning_rate': 1.1154846369695864e-05} - -{'loss': 0.3458, 'grad_norm': 6.669648170471191, 'learning_rate': 1.1154846369695864e-05, 'epoch': 0.49} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08640984892845155, 'train/info_loss': 0.20177681744098663, 'train/ref_loss': None, 'train/uncertainty_loss': -8.813131717033684e-05, 'train/video_loss': 0.20168869197368622, 'train/total_loss': 0.28809854388237} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1443, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022913094144314528, 'train/lm_loss': 5.6163530098274356e-05, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.10374864190816879, 'train/uncertainty_loss': -6.979886675253511e-05, 'train/video_loss': 0.10554598271846771, 'train/total_loss': 0.10560214519500732} -[Rank 0] Trainer log: {'loss': 0.2536, 'grad_norm': 3.587587833404541, 'learning_rate': 1.114424954112408e-05}[Rank 2] Trainer log: {'loss': 0.2536, 'grad_norm': 3.587587833404541, 'learning_rate': 1.114424954112408e-05} -[Rank 1] Trainer log: {'loss': 0.2536, 'grad_norm': 3.587587833404541, 'learning_rate': 1.114424954112408e-05} - -[Rank 3] Trainer log: {'loss': 0.2536, 'grad_norm': 3.587587833404541, 'learning_rate': 1.114424954112408e-05} -{'loss': 0.2536, 'grad_norm': 3.587587833404541, 'learning_rate': 1.114424954112408e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12324057817459107, 'train/info_loss': 0.18923181295394897, 'train/ref_loss': None, 'train/uncertainty_loss': -9.417231776751578e-05, 'train/video_loss': 0.18913763761520386, 'train/total_loss': 0.3123782277107239} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26344773769378665, 'train/info_loss': 0.18315644562244415, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001036380184814334, 'train/video_loss': 0.18305280804634094, 'train/total_loss': 0.4465005397796631} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4188, 'grad_norm': 2.551377058029175, 'learning_rate': 1.1133651410435763e-05} -[Rank 2] Trainer log: {'loss': 0.4188, 'grad_norm': 2.551377058029175, 'learning_rate': 1.1133651410435763e-05}[Rank 1] Trainer log: {'loss': 0.4188, 'grad_norm': 2.551377058029175, 'learning_rate': 1.1133651410435763e-05} - -[Rank 0] Trainer log: {'loss': 0.4188, 'grad_norm': 2.551377058029175, 'learning_rate': 1.1133651410435763e-05} -{'loss': 0.4188, 'grad_norm': 2.551377058029175, 'learning_rate': 1.1133651410435763e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3492, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1095, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002818437526002526, 'train/lm_loss': 4.379269958008081e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.25963181257247925, 'train/uncertainty_loss': 0.010947711765766144, 'train/video_loss': 0.27286437153816223, 'train/total_loss': 0.27290815114974976} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28115258216857913, 'train/info_loss': 0.18794149160385132, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011074421927332879, 'train/video_loss': 0.18783074617385864, 'train/total_loss': 0.46898332238197327} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3226, 'grad_norm': 10.113152503967285, 'learning_rate': 1.1123051989691227e-05} -[Rank 2] Trainer log: {'loss': 0.3226, 'grad_norm': 10.113152503967285, 'learning_rate': 1.1123051989691227e-05} -[Rank 3] Trainer log: {'loss': 0.3226, 'grad_norm': 10.113152503967285, 'learning_rate': 1.1123051989691227e-05} -[Rank 0] Trainer log: {'loss': 0.3226, 'grad_norm': 10.113152503967285, 'learning_rate': 1.1123051989691227e-05} -{'loss': 0.3226, 'grad_norm': 10.113152503967285, 'learning_rate': 1.1123051989691227e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13272207975387573, 'train/info_loss': 0.17483288049697876, 'train/ref_loss': None, 'train/uncertainty_loss': -8.809235296212137e-05, 'train/video_loss': 0.17474478483200073, 'train/total_loss': 0.30746686458587646} -tensor(0.4527, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2704842805862427, 'train/info_loss': 0.1728808730840683, 'train/ref_loss': None, 'train/uncertainty_loss': -8.194528636522591e-05, 'train/video_loss': 0.17279893159866333, 'train/total_loss': 0.44328323006629944} -tensor(0.1241, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4392, 'grad_norm': 8.083270072937012, 'learning_rate': 1.1112451290952238e-05} -[Rank 3] Trainer log: {'loss': 0.4392, 'grad_norm': 8.083270072937012, 'learning_rate': 1.1112451290952238e-05} -[Rank 2] Trainer log: {'loss': 0.4392, 'grad_norm': 8.083270072937012, 'learning_rate': 1.1112451290952238e-05} -[Rank 0] Trainer log: {'loss': 0.4392, 'grad_norm': 8.083270072937012, 'learning_rate': 1.1112451290952238e-05} -{'loss': 0.4392, 'grad_norm': 8.083270072937012, 'learning_rate': 1.1112451290952238e-05, 'epoch': 0.49} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2953122854232788, 'train/info_loss': 0.17293213307857513, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001295866910368204, 'train/video_loss': 0.17280255258083344, 'train/total_loss': 0.46811485290527344} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.6950, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1587, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0939, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017280979081988335, 'train/lm_loss': 3.871542867273092e-05, 'train/info_loss': 2.6344558136770502e-05, 'train/ref_loss': 0.27342718839645386, 'train/uncertainty_loss': 0.009385598450899126, 'train/video_loss': 0.2842215895652771, 'train/total_loss': 0.2842603027820587} -[Rank 3] Trainer log: {'loss': 0.4821, 'grad_norm': 4.413468360900879, 'learning_rate': 1.1101849326282028e-05}[Rank 0] Trainer log: {'loss': 0.4821, 'grad_norm': 4.413468360900879, 'learning_rate': 1.1101849326282028e-05}[Rank 1] Trainer log: {'loss': 0.4821, 'grad_norm': 4.413468360900879, 'learning_rate': 1.1101849326282028e-05} - - -[Rank 2] Trainer log: {'loss': 0.4821, 'grad_norm': 4.413468360900879, 'learning_rate': 1.1101849326282028e-05} -{'loss': 0.4821, 'grad_norm': 4.413468360900879, 'learning_rate': 1.1101849326282028e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0865, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1274, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031831343658268453, 'train/lm_loss': 4.987095016986132e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.28304415941238403, 'train/uncertainty_loss': 0.012737458944320679, 'train/video_loss': 0.2983599007129669, 'train/total_loss': 0.29840975999832153} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24576265811920167, 'train/info_loss': 0.1542835533618927, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010392318945378065, 'train/video_loss': 0.1541796326637268, 'train/total_loss': 0.3999422788619995} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1458, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3001, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.344, 'grad_norm': 2.794057607650757, 'learning_rate': 1.109124610774527e-05}[Rank 3] Trainer log: {'loss': 0.344, 'grad_norm': 2.794057607650757, 'learning_rate': 1.109124610774527e-05} - -[Rank 2] Trainer log: {'loss': 0.344, 'grad_norm': 2.794057607650757, 'learning_rate': 1.109124610774527e-05} -[Rank 0] Trainer log: {'loss': 0.344, 'grad_norm': 2.794057607650757, 'learning_rate': 1.109124610774527e-05} -{'loss': 0.344, 'grad_norm': 2.794057607650757, 'learning_rate': 1.109124610774527e-05, 'epoch': 0.49} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.053138858079910284, 'train/info_loss': 0.15604887902736664, 'train/ref_loss': None, 'train/uncertainty_loss': -8.930315379984677e-05, 'train/video_loss': 0.1559595763683319, 'train/total_loss': 0.2090984284877777} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1907549500465393, 'train/info_loss': 0.1956629753112793, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001216830685734749, 'train/video_loss': 0.19554129242897034, 'train/total_loss': 0.38629624247550964} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2081, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0075, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2611, 'grad_norm': 6.542510032653809, 'learning_rate': 1.1080641647408063e-05}[Rank 3] Trainer log: {'loss': 0.2611, 'grad_norm': 6.542510032653809, 'learning_rate': 1.1080641647408063e-05} - -[Rank 0] Trainer log: {'loss': 0.2611, 'grad_norm': 6.542510032653809, 'learning_rate': 1.1080641647408063e-05} -[Rank 2] Trainer log: {'loss': 0.2611, 'grad_norm': 6.542510032653809, 'learning_rate': 1.1080641647408063e-05} -{'loss': 0.2611, 'grad_norm': 6.542510032653809, 'learning_rate': 1.1080641647408063e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.2917, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003750406904146075, 'train/lm_loss': 4.982327809557319e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.4106074571609497, 'train/uncertainty_loss': 0.02916851043701172, 'train/video_loss': 0.44280806183815, 'train/total_loss': 0.44285789132118225} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2363327980041504, 'train/info_loss': 0.23862798511981964, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012621905189007522, 'train/video_loss': 0.23850177228450775, 'train/total_loss': 0.4748345613479614} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.2035, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3989, 'grad_norm': 4.814853191375732, 'learning_rate': 1.1070035957337911e-05} -[Rank 0] Trainer log: {'loss': 0.3989, 'grad_norm': 4.814853191375732, 'learning_rate': 1.1070035957337911e-05}[Rank 2] Trainer log: {'loss': 0.3989, 'grad_norm': 4.814853191375732, 'learning_rate': 1.1070035957337911e-05}[Rank 1] Trainer log: {'loss': 0.3989, 'grad_norm': 4.814853191375732, 'learning_rate': 1.1070035957337911e-05} - - -{'loss': 0.3989, 'grad_norm': 4.814853191375732, 'learning_rate': 1.1070035957337911e-05, 'epoch': 0.49} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2930912494659424, 'train/info_loss': 0.1110132485628128, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001038530725054443, 'train/video_loss': 0.11090939491987228, 'train/total_loss': 0.4040006697177887} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0184, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002315323567017913, 'train/lm_loss': 3.382873546797782e-05, 'train/info_loss': 2.7596188374445774e-05, 'train/ref_loss': 0.22197619080543518, 'train/uncertainty_loss': 0.0018355222418904305, 'train/video_loss': 0.22569157183170319, 'train/total_loss': 0.22572539746761322} -[Rank 0] Trainer log: {'loss': 0.3737, 'grad_norm': 2.1151883602142334, 'learning_rate': 1.1059429049603729e-05}[Rank 1] Trainer log: {'loss': 0.3737, 'grad_norm': 2.1151883602142334, 'learning_rate': 1.1059429049603729e-05} -[Rank 3] Trainer log: {'loss': 0.3737, 'grad_norm': 2.1151883602142334, 'learning_rate': 1.1059429049603729e-05} - -{'loss': 0.3737, 'grad_norm': 2.1151883602142334, 'learning_rate': 1.1059429049603729e-05, 'epoch': 0.49} -[Rank 2] Trainer log: {'loss': 0.3737, 'grad_norm': 2.1151883602142334, 'learning_rate': 1.1059429049603729e-05} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=)tensor(0.1553, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004411047324538231, 'train/lm_loss': 5.6973926257342106e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.09692198038101196, 'train/uncertainty_loss': -7.522919331677259e-05, 'train/video_loss': 0.10040688514709473, 'train/total_loss': 0.1004638597369194} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1232, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002410534303635359, 'train/lm_loss': 3.866775659844279e-05, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.29474392533302307, 'train/uncertainty_loss': 0.012324120104312898, 'train/video_loss': 0.3090232312679291, 'train/total_loss': 0.3090618848800659} -[Rank 1] Trainer log: {'loss': 0.2862, 'grad_norm': 2.253288745880127, 'learning_rate': 1.1048820936275808e-05} -[Rank 0] Trainer log: {'loss': 0.2862, 'grad_norm': 2.253288745880127, 'learning_rate': 1.1048820936275808e-05}[Rank 2] Trainer log: {'loss': 0.2862, 'grad_norm': 2.253288745880127, 'learning_rate': 1.1048820936275808e-05} - -[Rank 3] Trainer log: {'loss': 0.2862, 'grad_norm': 2.253288745880127, 'learning_rate': 1.1048820936275808e-05} -{'loss': 0.2862, 'grad_norm': 2.253288745880127, 'learning_rate': 1.1048820936275808e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0742, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1210, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035200587008148436, 'train/lm_loss': 3.370954655110836e-05, 'train/info_loss': 2.5510136765660718e-05, 'train/ref_loss': 0.28033754229545593, 'train/uncertainty_loss': 0.012104157358407974, 'train/video_loss': 0.2952832579612732, 'train/total_loss': 0.2953169643878937} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1353, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019406010396778585, 'train/lm_loss': 3.8643917650915686e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.29712358117103577, 'train/uncertainty_loss': 0.013533154129981996, 'train/video_loss': 0.31224027276039124, 'train/total_loss': 0.3122789263725281} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3868, 'grad_norm': 7.951351642608643, 'learning_rate': 1.1038211629425816e-05}[Rank 0] Trainer log: {'loss': 0.3868, 'grad_norm': 7.951351642608643, 'learning_rate': 1.1038211629425816e-05} - -[Rank 2] Trainer log: {'loss': 0.3868, 'grad_norm': 7.951351642608643, 'learning_rate': 1.1038211629425816e-05} -[Rank 1] Trainer log: {'loss': 0.3868, 'grad_norm': 7.951351642608643, 'learning_rate': 1.1038211629425816e-05} -{'loss': 0.3868, 'grad_norm': 7.951351642608643, 'learning_rate': 1.1038211629425816e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4002624034881592, 'train/info_loss': 0.15492947399616241, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012203333899378777, 'train/video_loss': 0.1548074334859848, 'train/total_loss': 0.5550698637962341} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1052, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1651, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3376471519470215, 'train/info_loss': 0.18813036382198334, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001100806868635118, 'train/video_loss': 0.18802028894424438, 'train/total_loss': 0.5256674289703369} -tensor(0.1806, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4393, 'grad_norm': 12.654496192932129, 'learning_rate': 1.1027601141126782e-05} -[Rank 1] Trainer log: {'loss': 0.4393, 'grad_norm': 12.654496192932129, 'learning_rate': 1.1027601141126782e-05} -[Rank 3] Trainer log: {'loss': 0.4393, 'grad_norm': 12.654496192932129, 'learning_rate': 1.1027601141126782e-05}[Rank 0] Trainer log: {'loss': 0.4393, 'grad_norm': 12.654496192932129, 'learning_rate': 1.1027601141126782e-05} - -{'loss': 0.4393, 'grad_norm': 12.654496192932129, 'learning_rate': 1.1027601141126782e-05, 'epoch': 0.49} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.397564959526062, 'train/info_loss': 0.13108095526695251, 'train/ref_loss': None, 'train/uncertainty_loss': -9.977406589314342e-05, 'train/video_loss': 0.1309811770915985, 'train/total_loss': 0.528546154499054} -tensor(0.3483, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2348, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1549, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022887005470693112, 'train/lm_loss': 4.9513409612700344e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.3085976839065552, 'train/uncertainty_loss': 0.015491369366645814, 'train/video_loss': 0.3259522616863251, 'train/total_loss': 0.32600176334381104} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0348, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3637, 'grad_norm': 11.753012657165527, 'learning_rate': 1.1016989483453075e-05}[Rank 2] Trainer log: {'loss': 0.3637, 'grad_norm': 11.753012657165527, 'learning_rate': 1.1016989483453075e-05}[Rank 1] Trainer log: {'loss': 0.3637, 'grad_norm': 11.753012657165527, 'learning_rate': 1.1016989483453075e-05} - - -[Rank 0] Trainer log: {'loss': 0.3637, 'grad_norm': 11.753012657165527, 'learning_rate': 1.1016989483453075e-05} -{'loss': 0.3637, 'grad_norm': 11.753012657165527, 'learning_rate': 1.1016989483453075e-05, 'epoch': 0.49} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1280, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020872587338089944, 'train/lm_loss': 0.00010482901707291603, 'train/info_loss': 3.5403903893893585e-05, 'train/ref_loss': 0.20040343701839447, 'train/uncertainty_loss': -7.224420551210642e-05, 'train/video_loss': 0.20203641057014465, 'train/total_loss': 0.20214124023914337} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13964327573776245, 'train/info_loss': 0.15552586317062378, 'train/ref_loss': None, 'train/uncertainty_loss': -9.333553025498987e-05, 'train/video_loss': 0.15543252229690552, 'train/total_loss': 0.29507580399513245} -[Rank 1] Trainer log: {'loss': 0.306, 'grad_norm': 5.195961952209473, 'learning_rate': 1.1006376668480394e-05} -[Rank 0] Trainer log: {'loss': 0.306, 'grad_norm': 5.195961952209473, 'learning_rate': 1.1006376668480394e-05}[Rank 2] Trainer log: {'loss': 0.306, 'grad_norm': 5.195961952209473, 'learning_rate': 1.1006376668480394e-05} - -[Rank 3] Trainer log: {'loss': 0.306, 'grad_norm': 5.195961952209473, 'learning_rate': 1.1006376668480394e-05}{'loss': 0.306, 'grad_norm': 5.195961952209473, 'learning_rate': 1.1006376668480394e-05, 'epoch': 0.49} - -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0312, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0408, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000508703151717782, 'train/lm_loss': 3.4257816150784495e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.16071653366088867, 'train/uncertainty_loss': -8.574715466238559e-05, 'train/video_loss': 0.16472890973091125, 'train/total_loss': 0.1647631675004959} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021469271741807462, 'train/lm_loss': 3.39717633323744e-05, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.171041339635849, 'train/uncertainty_loss': -7.088264101184905e-05, 'train/video_loss': 0.17271624505519867, 'train/total_loss': 0.17275021970272064} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2839, 'grad_norm': 4.700052738189697, 'learning_rate': 1.0995762708285757e-05} -[Rank 3] Trainer log: {'loss': 0.2839, 'grad_norm': 4.700052738189697, 'learning_rate': 1.0995762708285757e-05} -[Rank 1] Trainer log: {'loss': 0.2839, 'grad_norm': 4.700052738189697, 'learning_rate': 1.0995762708285757e-05} -[Rank 0] Trainer log: {'loss': 0.2839, 'grad_norm': 4.700052738189697, 'learning_rate': 1.0995762708285757e-05} -{'loss': 0.2839, 'grad_norm': 4.700052738189697, 'learning_rate': 1.0995762708285757e-05, 'epoch': 0.49} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09945175647735596, 'train/info_loss': 0.14495037496089935, 'train/ref_loss': None, 'train/uncertainty_loss': -9.581982158124447e-05, 'train/video_loss': 0.1448545604944229, 'train/total_loss': 0.24430632591247559} -tensor(0.4834, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1178, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0088, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023513161577284338, 'train/lm_loss': 4.436477611307055e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.15876366198062897, 'train/uncertainty_loss': -7.139577064663172e-05, 'train/video_loss': 0.16060270369052887, 'train/total_loss': 0.16064706444740295} -[Rank 1] Trainer log: {'loss': 0.4069, 'grad_norm': 8.24801254272461, 'learning_rate': 1.0985147614947484e-05}[Rank 3] Trainer log: {'loss': 0.4069, 'grad_norm': 8.24801254272461, 'learning_rate': 1.0985147614947484e-05}[Rank 0] Trainer log: {'loss': 0.4069, 'grad_norm': 8.24801254272461, 'learning_rate': 1.0985147614947484e-05} - - -[Rank 2] Trainer log: {'loss': 0.4069, 'grad_norm': 8.24801254272461, 'learning_rate': 1.0985147614947484e-05} -{'loss': 0.4069, 'grad_norm': 8.24801254272461, 'learning_rate': 1.0985147614947484e-05, 'epoch': 0.5} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29160776138305666, 'train/info_loss': 0.13176093995571136, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011834364850074054, 'train/video_loss': 0.13164259493350983, 'train/total_loss': 0.42325037717819214} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.7885, device='cuda:2', grad_fn=) tensor(0.0070, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1532, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0820, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019516355823725464, 'train/lm_loss': 3.0038485419936478e-05, 'train/info_loss': 2.592734745121561e-05, 'train/ref_loss': 0.2727270722389221, 'train/uncertainty_loss': 0.00820457935333252, 'train/video_loss': 0.2825188934803009, 'train/total_loss': 0.2825489342212677} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3442, 'grad_norm': 6.24219274520874, 'learning_rate': 1.0974531400545187e-05}[Rank 0] Trainer log: {'loss': 0.3442, 'grad_norm': 6.24219274520874, 'learning_rate': 1.0974531400545187e-05}[Rank 2] Trainer log: {'loss': 0.3442, 'grad_norm': 6.24219274520874, 'learning_rate': 1.0974531400545187e-05} - -[Rank 3] Trainer log: {'loss': 0.3442, 'grad_norm': 6.24219274520874, 'learning_rate': 1.0974531400545187e-05} - -{'loss': 0.3442, 'grad_norm': 6.24219274520874, 'learning_rate': 1.0974531400545187e-05, 'epoch': 0.5} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.6856, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09015951156616211, 'train/info_loss': 0.11720403283834457, 'train/ref_loss': None, 'train/uncertainty_loss': -9.571881964802742e-05, 'train/video_loss': 0.1171083152294159, 'train/total_loss': 0.20726782083511353} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(2.1338, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001996398437768221, 'train/lm_loss': 2.641503524500877e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 1.4147207736968994, 'train/uncertainty_loss': 0.21338369846343996, 'train/video_loss': 1.6297261714935303, 'train/total_loss': 1.6297526359558105} -tensor(0.0122, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.5771, 'grad_norm': 18.732324600219727, 'learning_rate': 1.0963914077159748e-05}[Rank 3] Trainer log: {'loss': 0.5771, 'grad_norm': 18.732324600219727, 'learning_rate': 1.0963914077159748e-05} - -[Rank 1] Trainer log: {'loss': 0.5771, 'grad_norm': 18.732324600219727, 'learning_rate': 1.0963914077159748e-05} -[Rank 0] Trainer log: {'loss': 0.5771, 'grad_norm': 18.732324600219727, 'learning_rate': 1.0963914077159748e-05} -{'loss': 0.5771, 'grad_norm': 18.732324600219727, 'learning_rate': 1.0963914077159748e-05, 'epoch': 0.5} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2605082273483276, 'train/info_loss': 0.15515132248401642, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011691863182932139, 'train/video_loss': 0.1550344079732895, 'train/total_loss': 0.4155426621437073} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0375, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.5152, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2623555898666382, 'train/info_loss': 0.14913254976272583, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011885652784258128, 'train/video_loss': 0.1490136981010437, 'train/total_loss': 0.41136929392814636} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3456, 'grad_norm': 2.580789089202881, 'learning_rate': 1.0953295656873322e-05}[Rank 3] Trainer log: {'loss': 0.3456, 'grad_norm': 2.580789089202881, 'learning_rate': 1.0953295656873322e-05} - -[Rank 2] Trainer log: {'loss': 0.3456, 'grad_norm': 2.580789089202881, 'learning_rate': 1.0953295656873322e-05} -[Rank 0] Trainer log: {'loss': 0.3456, 'grad_norm': 2.580789089202881, 'learning_rate': 1.0953295656873322e-05} -{'loss': 0.3456, 'grad_norm': 2.580789089202881, 'learning_rate': 1.0953295656873322e-05, 'epoch': 0.5} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0444, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022302384022623303, 'train/lm_loss': 5.0299993017688396e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.14556053280830383, 'train/uncertainty_loss': -6.513025145977735e-05, 'train/video_loss': 0.14731213450431824, 'train/total_loss': 0.14736244082450867} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00048029134050011637, 'train/lm_loss': 4.994245828129351e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.20675170421600342, 'train/uncertainty_loss': -7.731379009783268e-05, 'train/video_loss': 0.2105477750301361, 'train/total_loss': 0.21059772372245789} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2815, 'grad_norm': 4.774507999420166, 'learning_rate': 1.09426761517693e-05}[Rank 2] Trainer log: {'loss': 0.2815, 'grad_norm': 4.774507999420166, 'learning_rate': 1.09426761517693e-05}[Rank 0] Trainer log: {'loss': 0.2815, 'grad_norm': 4.774507999420166, 'learning_rate': 1.09426761517693e-05} - - -[Rank 1] Trainer log: {'loss': 0.2815, 'grad_norm': 4.774507999420166, 'learning_rate': 1.09426761517693e-05} -{'loss': 0.2815, 'grad_norm': 4.774507999420166, 'learning_rate': 1.09426761517693e-05, 'epoch': 0.5} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16342750787734986, 'train/info_loss': 0.1908058226108551, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012345178984105588, 'train/video_loss': 0.19068236649036407, 'train/total_loss': 0.35410988330841064} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2682, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2596060037612915, 'train/info_loss': 0.1615535020828247, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001009525265544653, 'train/video_loss': 0.1614525467157364, 'train/total_loss': 0.4210585355758667} -tensor(0.2409, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4268, 'grad_norm': 4.685720443725586, 'learning_rate': 1.0932055573932317e-05}[Rank 3] Trainer log: {'loss': 0.4268, 'grad_norm': 4.685720443725586, 'learning_rate': 1.0932055573932317e-05}[Rank 0] Trainer log: {'loss': 0.4268, 'grad_norm': 4.685720443725586, 'learning_rate': 1.0932055573932317e-05} - -[Rank 1] Trainer log: {'loss': 0.4268, 'grad_norm': 4.685720443725586, 'learning_rate': 1.0932055573932317e-05} - -{'loss': 0.4268, 'grad_norm': 4.685720443725586, 'learning_rate': 1.0932055573932317e-05, 'epoch': 0.5} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36341934204101567, 'train/info_loss': 0.1519460529088974, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010687740286812187, 'train/video_loss': 0.15183918178081512, 'train/total_loss': 0.5152585506439209} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0237, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002922236453741789, 'train/lm_loss': 3.3876410452649e-05, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.12241503596305847, 'train/uncertainty_loss': -6.958826561458409e-05, 'train/video_loss': 0.12471149116754532, 'train/total_loss': 0.12474536895751953} -[Rank 1] Trainer log: {'loss': 0.2869, 'grad_norm': 5.263150215148926, 'learning_rate': 1.0921433935448215e-05}[Rank 0] Trainer log: {'loss': 0.2869, 'grad_norm': 5.263150215148926, 'learning_rate': 1.0921433935448215e-05}[Rank 3] Trainer log: {'loss': 0.2869, 'grad_norm': 5.263150215148926, 'learning_rate': 1.0921433935448215e-05} - -[Rank 2] Trainer log: {'loss': 0.2869, 'grad_norm': 5.263150215148926, 'learning_rate': 1.0921433935448215e-05} - -{'loss': 0.2869, 'grad_norm': 5.263150215148926, 'learning_rate': 1.0921433935448215e-05, 'epoch': 0.5} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2642596960067749, 'train/info_loss': 0.27589187026023865, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013310702051967384, 'train/video_loss': 0.2757587730884552, 'train/total_loss': 0.5400184392929077} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0239, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.3728, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0006, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021885777823626995, 'train/lm_loss': 3.39717633323744e-05, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.11262551695108414, 'train/uncertainty_loss': -6.477092392742634e-05, 'train/video_loss': 0.11433837562799454, 'train/total_loss': 0.11437235027551651} -[Rank 3] Trainer log: {'loss': 0.3515, 'grad_norm': 9.342670440673828, 'learning_rate': 1.0910811248404064e-05}[Rank 2] Trainer log: {'loss': 0.3515, 'grad_norm': 9.342670440673828, 'learning_rate': 1.0910811248404064e-05}[Rank 1] Trainer log: {'loss': 0.3515, 'grad_norm': 9.342670440673828, 'learning_rate': 1.0910811248404064e-05} - - -[Rank 0] Trainer log: {'loss': 0.3515, 'grad_norm': 9.342670440673828, 'learning_rate': 1.0910811248404064e-05} -{'loss': 0.3515, 'grad_norm': 9.342670440673828, 'learning_rate': 1.0910811248404064e-05, 'epoch': 0.5} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10740132331848146, 'train/info_loss': 0.1762283742427826, 'train/ref_loss': None, 'train/uncertainty_loss': -9.86594706773758e-05, 'train/video_loss': 0.17612971365451813, 'train/total_loss': 0.2835310399532318} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0006, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32494013309478764, 'train/info_loss': 0.18150775134563446, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012235516915097834, 'train/video_loss': 0.18138539791107178, 'train/total_loss': 0.5063255429267883} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2530, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3519, 'grad_norm': 6.264915466308594, 'learning_rate': 1.090018752488811e-05}[Rank 2] Trainer log: {'loss': 0.3519, 'grad_norm': 6.264915466308594, 'learning_rate': 1.090018752488811e-05}[Rank 3] Trainer log: {'loss': 0.3519, 'grad_norm': 6.264915466308594, 'learning_rate': 1.090018752488811e-05} - - -[Rank 0] Trainer log: {'loss': 0.3519, 'grad_norm': 6.264915466308594, 'learning_rate': 1.090018752488811e-05} -{'loss': 0.3519, 'grad_norm': 6.264915466308594, 'learning_rate': 1.090018752488811e-05, 'epoch': 0.5} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023029844742268325, 'train/lm_loss': 3.0038485419936478e-05, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.08810417354106903, 'train/uncertainty_loss': -6.817751564085484e-05, 'train/video_loss': 0.08990514278411865, 'train/total_loss': 0.08993518352508545} -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22789695262908938, 'train/info_loss': 0.20524795353412628, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015664729289710522, 'train/video_loss': 0.20509131252765656, 'train/total_loss': 0.4329882860183716} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.0585, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3105, 'grad_norm': 2.385098695755005, 'learning_rate': 1.0889562776989785e-05}[Rank 3] Trainer log: {'loss': 0.3105, 'grad_norm': 2.385098695755005, 'learning_rate': 1.0889562776989785e-05} -[Rank 1] Trainer log: {'loss': 0.3105, 'grad_norm': 2.385098695755005, 'learning_rate': 1.0889562776989785e-05} - -[Rank 2] Trainer log: {'loss': 0.3105, 'grad_norm': 2.385098695755005, 'learning_rate': 1.0889562776989785e-05} -{'loss': 0.3105, 'grad_norm': 2.385098695755005, 'learning_rate': 1.0889562776989785e-05, 'epoch': 0.5} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0585, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027687137480825186, 'train/lm_loss': 4.982327809557319e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.13023173809051514, 'train/uncertainty_loss': -6.695783813484014e-05, 'train/video_loss': 0.13241200149059296, 'train/total_loss': 0.13246183097362518} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1857, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027203920762985945, 'train/lm_loss': 3.406711330171675e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.3292035758495331, 'train/uncertainty_loss': 0.01857106238603592, 'train/video_loss': 0.3499774634838104, 'train/total_loss': 0.35001152753829956} -[Rank 3] Trainer log: {'loss': 0.3528, 'grad_norm': 5.027342319488525, 'learning_rate': 1.0878937016799684e-05}[Rank 1] Trainer log: {'loss': 0.3528, 'grad_norm': 5.027342319488525, 'learning_rate': 1.0878937016799684e-05}[Rank 0] Trainer log: {'loss': 0.3528, 'grad_norm': 5.027342319488525, 'learning_rate': 1.0878937016799684e-05} - -[Rank 2] Trainer log: {'loss': 0.3528, 'grad_norm': 5.027342319488525, 'learning_rate': 1.0878937016799684e-05} - -{'loss': 0.3528, 'grad_norm': 5.027342319488525, 'learning_rate': 1.0878937016799684e-05, 'epoch': 0.5} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11722747087478638, 'train/info_loss': 0.2312084287405014, 'train/ref_loss': None, 'train/uncertainty_loss': -8.984003216028214e-05, 'train/video_loss': 0.2311185896396637, 'train/total_loss': 0.3483460545539856} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0021, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15923023223876953, 'train/info_loss': 0.13133738934993744, 'train/ref_loss': None, 'train/uncertainty_loss': -9.667068952694536e-05, 'train/video_loss': 0.13124072551727295, 'train/total_loss': 0.2904709577560425} -tensor(0.0165, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0932, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1087, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3075, 'grad_norm': 4.843605995178223, 'learning_rate': 1.086831025640956e-05}[Rank 1] Trainer log: {'loss': 0.3075, 'grad_norm': 4.843605995178223, 'learning_rate': 1.086831025640956e-05}[Rank 3] Trainer log: {'loss': 0.3075, 'grad_norm': 4.843605995178223, 'learning_rate': 1.086831025640956e-05} - -[Rank 2] Trainer log: {'loss': 0.3075, 'grad_norm': 4.843605995178223, 'learning_rate': 1.086831025640956e-05} - -{'loss': 0.3075, 'grad_norm': 4.843605995178223, 'learning_rate': 1.086831025640956e-05, 'epoch': 0.5} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29336950778961185, 'train/info_loss': 0.1222333014011383, 'train/ref_loss': None, 'train/uncertainty_loss': -9.795302757993342e-05, 'train/video_loss': 0.12213534861803055, 'train/total_loss': 0.4155048429965973} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36091601848602295, 'train/info_loss': 0.2484631985425949, 'train/ref_loss': None, 'train/uncertainty_loss': -9.899059077724815e-05, 'train/video_loss': 0.24836421012878418, 'train/total_loss': 0.6092802286148071} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4176, 'grad_norm': 3.1502151489257812, 'learning_rate': 1.0857682507912296e-05}[Rank 2] Trainer log: {'loss': 0.4176, 'grad_norm': 3.1502151489257812, 'learning_rate': 1.0857682507912296e-05}[Rank 3] Trainer log: {'loss': 0.4176, 'grad_norm': 3.1502151489257812, 'learning_rate': 1.0857682507912296e-05} - - -[Rank 0] Trainer log: {'loss': 0.4176, 'grad_norm': 3.1502151489257812, 'learning_rate': 1.0857682507912296e-05} -{'loss': 0.4176, 'grad_norm': 3.1502151489257812, 'learning_rate': 1.0857682507912296e-05, 'epoch': 0.5} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2038806200027466, 'train/info_loss': 0.20456263422966003, 'train/ref_loss': None, 'train/uncertainty_loss': -8.710014517419041e-05, 'train/video_loss': 0.204475536942482, 'train/total_loss': 0.4083561599254608} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1118, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0701, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019134742906317116, 'train/lm_loss': 2.9800101765431466e-05, 'train/info_loss': 2.4735316401347518e-05, 'train/ref_loss': 0.2477477490901947, 'train/uncertainty_loss': 0.007009581476449967, 'train/video_loss': 0.25631284713745117, 'train/total_loss': 0.25634264945983887} -[Rank 1] Trainer log: {'loss': 0.334, 'grad_norm': 5.813259124755859, 'learning_rate': 1.0847053783401902e-05}[Rank 2] Trainer log: {'loss': 0.334, 'grad_norm': 5.813259124755859, 'learning_rate': 1.0847053783401902e-05}[Rank 0] Trainer log: {'loss': 0.334, 'grad_norm': 5.813259124755859, 'learning_rate': 1.0847053783401902e-05} - - -[Rank 3] Trainer log: {'loss': 0.334, 'grad_norm': 5.813259124755859, 'learning_rate': 1.0847053783401902e-05} -{'loss': 0.334, 'grad_norm': 5.813259124755859, 'learning_rate': 1.0847053783401902e-05, 'epoch': 0.5} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2343, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0391, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023553078062832355, 'train/lm_loss': 2.2815357078798117e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.24780906736850739, 'train/uncertainty_loss': 0.003909558057785034, 'train/video_loss': 0.25362521409988403, 'train/total_loss': 0.253648042678833} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1524, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022414601407945157, 'train/lm_loss': 1.5568127855658533e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.31131085753440857, 'train/uncertainty_loss': 0.01523561328649521, 'train/video_loss': 0.3283596634864807, 'train/total_loss': 0.3283752202987671} -[Rank 0] Trainer log: {'loss': 0.4127, 'grad_norm': 6.0833234786987305, 'learning_rate': 1.0836424094973507e-05}[Rank 2] Trainer log: {'loss': 0.4127, 'grad_norm': 6.0833234786987305, 'learning_rate': 1.0836424094973507e-05}[Rank 3] Trainer log: {'loss': 0.4127, 'grad_norm': 6.0833234786987305, 'learning_rate': 1.0836424094973507e-05} - -[Rank 1] Trainer log: {'loss': 0.4127, 'grad_norm': 6.0833234786987305, 'learning_rate': 1.0836424094973507e-05} - -{'loss': 0.4127, 'grad_norm': 6.0833234786987305, 'learning_rate': 1.0836424094973507e-05, 'epoch': 0.5} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.1598, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002729943720623851, 'train/lm_loss': 2.021687396336347e-05, 'train/info_loss': 2.0503577616182156e-05, 'train/ref_loss': 0.20329363644123077, 'train/uncertainty_loss': -6.881817243993282e-05, 'train/video_loss': 0.2054292857646942, 'train/total_loss': 0.20544950664043427} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002546653850004077, 'train/lm_loss': 2.958555705845356e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.09683065861463547, 'train/uncertainty_loss': -6.85378268826753e-05, 'train/video_loss': 0.09882477670907974, 'train/total_loss': 0.09885436296463013} -[Rank 0] Trainer log: {'loss': 0.2428, 'grad_norm': 8.853964805603027, 'learning_rate': 1.0825793454723325e-05}[Rank 1] Trainer log: {'loss': 0.2428, 'grad_norm': 8.853964805603027, 'learning_rate': 1.0825793454723325e-05}[Rank 3] Trainer log: {'loss': 0.2428, 'grad_norm': 8.853964805603027, 'learning_rate': 1.0825793454723325e-05}[Rank 2] Trainer log: {'loss': 0.2428, 'grad_norm': 8.853964805603027, 'learning_rate': 1.0825793454723325e-05} - - - -{'loss': 0.2428, 'grad_norm': 8.853964805603027, 'learning_rate': 1.0825793454723325e-05, 'epoch': 0.5} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0068, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0162, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019698073156178, 'train/lm_loss': 2.6081292890012265e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.21795359253883362, 'train/uncertainty_loss': 0.0016215061768889427, 'train/video_loss': 0.22117435932159424, 'train/total_loss': 0.22120043635368347} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18874398469924927, 'train/info_loss': 0.22209110856056213, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010811459505930543, 'train/video_loss': 0.22198300063610077, 'train/total_loss': 0.41072699427604675} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3452, 'grad_norm': 2.7354323863983154, 'learning_rate': 1.081516187474866e-05}[Rank 0] Trainer log: {'loss': 0.3452, 'grad_norm': 2.7354323863983154, 'learning_rate': 1.081516187474866e-05} -[Rank 2] Trainer log: {'loss': 0.3452, 'grad_norm': 2.7354323863983154, 'learning_rate': 1.081516187474866e-05} -[Rank 3] Trainer log: {'loss': 0.3452, 'grad_norm': 2.7354323863983154, 'learning_rate': 1.081516187474866e-05} - -{'loss': 0.3452, 'grad_norm': 2.7354323863983154, 'learning_rate': 1.081516187474866e-05, 'epoch': 0.5} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16946561336517335, 'train/info_loss': 0.1656816601753235, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013645966537296773, 'train/video_loss': 0.16554519534111023, 'train/total_loss': 0.335010826587677} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1700, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021133229602128268, 'train/lm_loss': 2.6105131837539375e-05, 'train/info_loss': 2.2172436729306355e-05, 'train/ref_loss': 0.2052234709262848, 'train/uncertainty_loss': -6.585141527466476e-05, 'train/video_loss': 0.2068704515695572, 'train/total_loss': 0.2068965584039688} -[Rank 1] Trainer log: {'loss': 0.4152, 'grad_norm': 5.053834915161133, 'learning_rate': 1.0804529367147878e-05} -[Rank 3] Trainer log: {'loss': 0.4152, 'grad_norm': 5.053834915161133, 'learning_rate': 1.0804529367147878e-05}[Rank 0] Trainer log: {'loss': 0.4152, 'grad_norm': 5.053834915161133, 'learning_rate': 1.0804529367147878e-05}[Rank 2] Trainer log: {'loss': 0.4152, 'grad_norm': 5.053834915161133, 'learning_rate': 1.0804529367147878e-05} - - -{'loss': 0.4152, 'grad_norm': 5.053834915161133, 'learning_rate': 1.0804529367147878e-05, 'epoch': 0.5} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32183802127838135, 'train/info_loss': 0.20706190168857574, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010759469587355852, 'train/video_loss': 0.20695430040359497, 'train/total_loss': 0.5287923216819763} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0315, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1169, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.1478, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030318014323711396, 'train/lm_loss': 2.956171811092645e-05, 'train/info_loss': 2.3602882720297202e-05, 'train/ref_loss': 0.13902783393859863, 'train/uncertainty_loss': -7.12720153387636e-05, 'train/video_loss': 0.1414056122303009, 'train/total_loss': 0.1414351761341095} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3111, 'grad_norm': 6.48052453994751, 'learning_rate': 1.0793895944020417e-05} -[Rank 3] Trainer log: {'loss': 0.3111, 'grad_norm': 6.48052453994751, 'learning_rate': 1.0793895944020417e-05} -[Rank 0] Trainer log: {'loss': 0.3111, 'grad_norm': 6.48052453994751, 'learning_rate': 1.0793895944020417e-05} -[Rank 2] Trainer log: {'loss': 0.3111, 'grad_norm': 6.48052453994751, 'learning_rate': 1.0793895944020417e-05} -{'loss': 0.3111, 'grad_norm': 6.48052453994751, 'learning_rate': 1.0793895944020417e-05, 'epoch': 0.5} -tensor(0.1047, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=)tensor(0.0128, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:3', grad_fn=) - -{'train/tv_loss': 0.00022256055381149055, 'train/lm_loss': 2.3149103799369187e-05, 'train/info_loss': 2.181482341256924e-05, 'train/ref_loss': 0.09922556579113007, 'train/uncertainty_loss': -6.903604953549803e-05, 'train/video_loss': 0.10095883160829544, 'train/total_loss': 0.10098198056221008} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2089, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019485827069729567, 'train/lm_loss': 3.378106048330665e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.35350745916366577, 'train/uncertainty_loss': 0.020887675881385806, 'train/video_loss': 0.37597933411598206, 'train/total_loss': 0.3760131299495697} -[Rank 3] Trainer log: {'loss': 0.2915, 'grad_norm': 4.198022365570068, 'learning_rate': 1.0783261617466734e-05}[Rank 1] Trainer log: {'loss': 0.2915, 'grad_norm': 4.198022365570068, 'learning_rate': 1.0783261617466734e-05}[Rank 0] Trainer log: {'loss': 0.2915, 'grad_norm': 4.198022365570068, 'learning_rate': 1.0783261617466734e-05} - - -[Rank 2] Trainer log: {'loss': 0.2915, 'grad_norm': 4.198022365570068, 'learning_rate': 1.0783261617466734e-05} -{'loss': 0.2915, 'grad_norm': 4.198022365570068, 'learning_rate': 1.0783261617466734e-05, 'epoch': 0.5} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2615, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008802796714007855, 'train/lm_loss': 2.6033614994958045e-05, 'train/info_loss': 2.396049239905551e-05, 'train/ref_loss': 0.09829133749008179, 'train/uncertainty_loss': -7.993573090061546e-05, 'train/video_loss': 0.10527759790420532, 'train/total_loss': 0.10530363023281097} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0430, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000248944410122931, 'train/lm_loss': 6.216990295797587e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.24028754234313965, 'train/uncertainty_loss': 0.004300205782055855, 'train/video_loss': 0.24660582840442657, 'train/total_loss': 0.2466679960489273} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3133, 'grad_norm': 6.723911285400391, 'learning_rate': 1.0772626399588337e-05}[Rank 2] Trainer log: {'loss': 0.3133, 'grad_norm': 6.723911285400391, 'learning_rate': 1.0772626399588337e-05} - -[Rank 1] Trainer log: {'loss': 0.3133, 'grad_norm': 6.723911285400391, 'learning_rate': 1.0772626399588337e-05} -[Rank 0] Trainer log: {'loss': 0.3133, 'grad_norm': 6.723911285400391, 'learning_rate': 1.0772626399588337e-05} -{'loss': 0.3133, 'grad_norm': 6.723911285400391, 'learning_rate': 1.0772626399588337e-05, 'epoch': 0.5} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29653751850128174, 'train/info_loss': 0.2257750928401947, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010498387273401023, 'train/video_loss': 0.22567011415958405, 'train/total_loss': 0.5222076177597046} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.3486, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0112, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0955, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0285, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002920090686529875, 'train/lm_loss': 1.9859281019307675e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.23894540965557098, 'train/uncertainty_loss': 0.0028548486530780796, 'train/video_loss': 0.24415767192840576, 'train/total_loss': 0.24417753517627716} -[Rank 2] Trainer log: {'loss': 0.3883, 'grad_norm': 5.016867160797119, 'learning_rate': 1.0761990302487728e-05}[Rank 1] Trainer log: {'loss': 0.3883, 'grad_norm': 5.016867160797119, 'learning_rate': 1.0761990302487728e-05}[Rank 3] Trainer log: {'loss': 0.3883, 'grad_norm': 5.016867160797119, 'learning_rate': 1.0761990302487728e-05} - - -[Rank 0] Trainer log: {'loss': 0.3883, 'grad_norm': 5.016867160797119, 'learning_rate': 1.0761990302487728e-05} -{'loss': 0.3883, 'grad_norm': 5.016867160797119, 'learning_rate': 1.0761990302487728e-05, 'epoch': 0.5} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24594607353210451, 'train/info_loss': 0.19821004569530487, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012757693184539676, 'train/video_loss': 0.19808246195316315, 'train/total_loss': 0.4440285563468933} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2410, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0020, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09500598311424256, 'train/info_loss': 0.10793036967515945, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010094086173921824, 'train/video_loss': 0.10782942920923233, 'train/total_loss': 0.20283541083335876} -tensor(0.9236, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4882, 'grad_norm': 10.706526756286621, 'learning_rate': 1.0751353338268426e-05}[Rank 0] Trainer log: {'loss': 0.4882, 'grad_norm': 10.706526756286621, 'learning_rate': 1.0751353338268426e-05} -[Rank 1] Trainer log: {'loss': 0.4882, 'grad_norm': 10.706526756286621, 'learning_rate': 1.0751353338268426e-05} - -[Rank 2] Trainer log: {'loss': 0.4882, 'grad_norm': 10.706526756286621, 'learning_rate': 1.0751353338268426e-05} -{'loss': 0.4882, 'grad_norm': 10.706526756286621, 'learning_rate': 1.0751353338268426e-05, 'epoch': 0.5} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002487720921635628, 'train/lm_loss': 3.871542867273092e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.1406993716955185, 'train/uncertainty_loss': -7.172140176407993e-05, 'train/video_loss': 0.14264436066150665, 'train/total_loss': 0.14268307387828827} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14720913171768188, 'train/info_loss': 0.2317415326833725, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001581651973538101, 'train/video_loss': 0.231583371758461, 'train/total_loss': 0.37879252433776855} -tensor(0.0384, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3088, 'grad_norm': 4.429353713989258, 'learning_rate': 1.074071551903492e-05}[Rank 0] Trainer log: {'loss': 0.3088, 'grad_norm': 4.429353713989258, 'learning_rate': 1.074071551903492e-05}[Rank 1] Trainer log: {'loss': 0.3088, 'grad_norm': 4.429353713989258, 'learning_rate': 1.074071551903492e-05} - - -[Rank 2] Trainer log: {'loss': 0.3088, 'grad_norm': 4.429353713989258, 'learning_rate': 1.074071551903492e-05} -{'loss': 0.3088, 'grad_norm': 4.429353713989258, 'learning_rate': 1.074071551903492e-05, 'epoch': 0.5} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.8784, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016893416177481415, 'train/lm_loss': 3.904915065504611e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.09933105111122131, 'train/uncertainty_loss': -7.052637520246209e-05, 'train/video_loss': 0.100640669465065, 'train/total_loss': 0.10067971795797348} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0004, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002531633246690035, 'train/lm_loss': 3.0109999352134766e-05, 'train/info_loss': 2.5510136765660718e-05, 'train/ref_loss': 0.23014262318611145, 'train/uncertainty_loss': -4.357477882876992e-05, 'train/video_loss': 0.2321498692035675, 'train/total_loss': 0.23217998445034027} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3661, 'grad_norm': 3.087372064590454, 'learning_rate': 1.0730076856892696e-05}[Rank 1] Trainer log: {'loss': 0.3661, 'grad_norm': 3.087372064590454, 'learning_rate': 1.0730076856892696e-05} - -[Rank 2] Trainer log: {'loss': 0.3661, 'grad_norm': 3.087372064590454, 'learning_rate': 1.0730076856892696e-05}[Rank 0] Trainer log: {'loss': 0.3661, 'grad_norm': 3.087372064590454, 'learning_rate': 1.0730076856892696e-05} - -{'loss': 0.3661, 'grad_norm': 3.087372064590454, 'learning_rate': 1.0730076856892696e-05, 'epoch': 0.5} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.122895085811615, 'train/info_loss': 0.196620374917984, 'train/ref_loss': None, 'train/uncertainty_loss': -8.96754558198154e-05, 'train/video_loss': 0.19653069972991943, 'train/total_loss': 0.3194257915019989} -tensor(0.0675, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17834944725036622, 'train/info_loss': 0.16666467487812042, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013880172045901417, 'train/video_loss': 0.16652587056159973, 'train/total_loss': 0.3448753356933594} -tensor(0.4104, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.7918, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4538, 'grad_norm': 9.152913093566895, 'learning_rate': 1.071943736394817e-05}[Rank 3] Trainer log: {'loss': 0.4538, 'grad_norm': 9.152913093566895, 'learning_rate': 1.071943736394817e-05}[Rank 1] Trainer log: {'loss': 0.4538, 'grad_norm': 9.152913093566895, 'learning_rate': 1.071943736394817e-05} - - -[Rank 2] Trainer log: {'loss': 0.4538, 'grad_norm': 9.152913093566895, 'learning_rate': 1.071943736394817e-05} -{'loss': 0.4538, 'grad_norm': 9.152913093566895, 'learning_rate': 1.071943736394817e-05, 'epoch': 0.5} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2209880590438843, 'train/info_loss': 0.1880016028881073, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001170686911791563, 'train/video_loss': 0.18788453936576843, 'train/total_loss': 0.4088726043701172} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.1552, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3371670484542847, 'train/info_loss': 0.1718449741601944, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015956436982378364, 'train/video_loss': 0.17168541252613068, 'train/total_loss': 0.508852481842041} -tensor(0.4773, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.426, 'grad_norm': 11.618485450744629, 'learning_rate': 1.070879705230873e-05}[Rank 1] Trainer log: {'loss': 0.426, 'grad_norm': 11.618485450744629, 'learning_rate': 1.070879705230873e-05} -[Rank 2] Trainer log: {'loss': 0.426, 'grad_norm': 11.618485450744629, 'learning_rate': 1.070879705230873e-05} - -[Rank 0] Trainer log: {'loss': 0.426, 'grad_norm': 11.618485450744629, 'learning_rate': 1.070879705230873e-05} -{'loss': 0.426, 'grad_norm': 11.618485450744629, 'learning_rate': 1.070879705230873e-05, 'epoch': 0.5} -tensor(0.0478, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0818, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1472, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1397, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002031283685937524, 'train/lm_loss': 5.818951176479459e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.3087729513645172, 'train/uncertainty_loss': 0.013970728218555451, 'train/video_loss': 0.32439881563186646, 'train/total_loss': 0.3244570195674896} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3054951667785645, 'train/info_loss': 0.11062386631965637, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010935744503512979, 'train/video_loss': 0.11051450669765472, 'train/total_loss': 0.41600966453552246} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3722, 'grad_norm': 2.670192003250122, 'learning_rate': 1.0698155934082676e-05}[Rank 0] Trainer log: {'loss': 0.3722, 'grad_norm': 2.670192003250122, 'learning_rate': 1.0698155934082676e-05} -[Rank 3] Trainer log: {'loss': 0.3722, 'grad_norm': 2.670192003250122, 'learning_rate': 1.0698155934082676e-05} - -[Rank 2] Trainer log: {'loss': 0.3722, 'grad_norm': 2.670192003250122, 'learning_rate': 1.0698155934082676e-05} -{'loss': 0.3722, 'grad_norm': 2.670192003250122, 'learning_rate': 1.0698155934082676e-05, 'epoch': 0.5} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24351134300231936, 'train/info_loss': 0.19111013412475586, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013701507123187184, 'train/video_loss': 0.19097311794757843, 'train/total_loss': 0.43448448181152344} -tensor(0.1625, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1814, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37401905059814455, 'train/info_loss': 0.1026383563876152, 'train/ref_loss': None, 'train/uncertainty_loss': -9.714022162370384e-05, 'train/video_loss': 0.10254121571779251, 'train/total_loss': 0.4765602648258209} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4597, 'grad_norm': 6.268892288208008, 'learning_rate': 1.0687514021379241e-05}[Rank 1] Trainer log: {'loss': 0.4597, 'grad_norm': 6.268892288208008, 'learning_rate': 1.0687514021379241e-05}[Rank 2] Trainer log: {'loss': 0.4597, 'grad_norm': 6.268892288208008, 'learning_rate': 1.0687514021379241e-05} - - -[Rank 0] Trainer log: {'loss': 0.4597, 'grad_norm': 6.268892288208008, 'learning_rate': 1.0687514021379241e-05} -{'loss': 0.4597, 'grad_norm': 6.268892288208008, 'learning_rate': 1.0687514021379241e-05, 'epoch': 0.5} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0972, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.6591, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0011393025517463685, 'train/lm_loss': 5.168247153051198e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.7586095333099365, 'train/uncertainty_loss': 0.06590849161148071, 'train/video_loss': 0.8336604833602905, 'train/total_loss': 0.8337121605873108} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10644707679748536, 'train/info_loss': 0.17781701683998108, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010291835060343147, 'train/video_loss': 0.17771409451961517, 'train/total_loss': 0.28416118025779724} -tensor(0.3970, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4972, 'grad_norm': 4.396923542022705, 'learning_rate': 1.0676871326308547e-05}[Rank 1] Trainer log: {'loss': 0.4972, 'grad_norm': 4.396923542022705, 'learning_rate': 1.0676871326308547e-05} -[Rank 0] Trainer log: {'loss': 0.4972, 'grad_norm': 4.396923542022705, 'learning_rate': 1.0676871326308547e-05} - -[Rank 3] Trainer log: {'loss': 0.4972, 'grad_norm': 4.396923542022705, 'learning_rate': 1.0676871326308547e-05} -{'loss': 0.4972, 'grad_norm': 4.396923542022705, 'learning_rate': 1.0676871326308547e-05, 'epoch': 0.5} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2923597812652588, 'train/info_loss': 0.2108151912689209, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011920119868591428, 'train/video_loss': 0.2106959968805313, 'train/total_loss': 0.5030558109283447} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3392985343933106, 'train/info_loss': 0.19302304089069366, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001141425222158432, 'train/video_loss': 0.1929088979959488, 'train/total_loss': 0.5322074294090271} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.6875, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.395, 'grad_norm': 11.852221488952637, 'learning_rate': 1.0666227860981615e-05}[Rank 1] Trainer log: {'loss': 0.395, 'grad_norm': 11.852221488952637, 'learning_rate': 1.0666227860981615e-05}[Rank 0] Trainer log: {'loss': 0.395, 'grad_norm': 11.852221488952637, 'learning_rate': 1.0666227860981615e-05} - - -[Rank 3] Trainer log: {'loss': 0.395, 'grad_norm': 11.852221488952637, 'learning_rate': 1.0666227860981615e-05} -{'loss': 0.395, 'grad_norm': 11.852221488952637, 'learning_rate': 1.0666227860981615e-05, 'epoch': 0.5} -tensor(0.1652, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2392, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2652781963348389, 'train/info_loss': 0.2081359326839447, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010546979028731586, 'train/video_loss': 0.20803046226501465, 'train/total_loss': 0.47330865263938904} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14935612678527832, 'train/info_loss': 0.12005950510501862, 'train/ref_loss': None, 'train/uncertainty_loss': -9.615710587240756e-05, 'train/video_loss': 0.11996334791183472, 'train/total_loss': 0.26931947469711304} -tensor(0.2323, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4154, 'grad_norm': 18.819732666015625, 'learning_rate': 1.0655583637510346e-05}[Rank 2] Trainer log: {'loss': 0.4154, 'grad_norm': 18.819732666015625, 'learning_rate': 1.0655583637510346e-05} - -[Rank 1] Trainer log: {'loss': 0.4154, 'grad_norm': 18.819732666015625, 'learning_rate': 1.0655583637510346e-05}[Rank 0] Trainer log: {'loss': 0.4154, 'grad_norm': 18.819732666015625, 'learning_rate': 1.0655583637510346e-05} - -{'loss': 0.4154, 'grad_norm': 18.819732666015625, 'learning_rate': 1.0655583637510346e-05, 'epoch': 0.51} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2565967082977295, 'train/info_loss': 0.22512532770633698, 'train/ref_loss': None, 'train/uncertainty_loss': -8.974713273346424e-05, 'train/video_loss': 0.22503557801246643, 'train/total_loss': 0.4816322922706604} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1940, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21435079574584961, 'train/info_loss': 0.2778104841709137, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011616776464506984, 'train/video_loss': 0.27769431471824646, 'train/total_loss': 0.4920451045036316} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3127, 'grad_norm': 5.855369567871094, 'learning_rate': 1.0644938668007498e-05}[Rank 0] Trainer log: {'loss': 0.3127, 'grad_norm': 5.855369567871094, 'learning_rate': 1.0644938668007498e-05}[Rank 3] Trainer log: {'loss': 0.3127, 'grad_norm': 5.855369567871094, 'learning_rate': 1.0644938668007498e-05} - - -[Rank 2] Trainer log: {'loss': 0.3127, 'grad_norm': 5.855369567871094, 'learning_rate': 1.0644938668007498e-05} -{'loss': 0.3127, 'grad_norm': 5.855369567871094, 'learning_rate': 1.0644938668007498e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1436270833015442, 'train/info_loss': 0.1685701608657837, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011445891577750445, 'train/video_loss': 0.16845570504665375, 'train/total_loss': 0.31208279728889465} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1369, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1710, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002840730827301741, 'train/lm_loss': 0.00015329206362366677, 'train/info_loss': 3.653631210909225e-05, 'train/ref_loss': 0.3205668032169342, 'train/uncertainty_loss': 0.01709582805633545, 'train/video_loss': 0.33997175097465515, 'train/total_loss': 0.34012505412101746} -tensor(0.0680, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3591, 'grad_norm': 11.574625015258789, 'learning_rate': 1.0634292964586679e-05} -[Rank 3] Trainer log: {'loss': 0.3591, 'grad_norm': 11.574625015258789, 'learning_rate': 1.0634292964586679e-05} -[Rank 2] Trainer log: {'loss': 0.3591, 'grad_norm': 11.574625015258789, 'learning_rate': 1.0634292964586679e-05} -[Rank 0] Trainer log: {'loss': 0.3591, 'grad_norm': 11.574625015258789, 'learning_rate': 1.0634292964586679e-05} -{'loss': 0.3591, 'grad_norm': 11.574625015258789, 'learning_rate': 1.0634292964586679e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0110, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002454160246998072, 'train/lm_loss': 5.26358955539763e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.2123194932937622, 'train/uncertainty_loss': 0.0010973616503179074, 'train/video_loss': 0.21540868282318115, 'train/total_loss': 0.21546131372451782} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1407, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032167178578674794, 'train/lm_loss': 4.6057166764512664e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.2981242537498474, 'train/uncertainty_loss': 0.014071829617023468, 'train/video_loss': 0.31479793787002563, 'train/total_loss': 0.3148439824581146} -[Rank 1] Trainer log: {'loss': 0.2934, 'grad_norm': 4.6428327560424805, 'learning_rate': 1.062364653936233e-05}[Rank 0] Trainer log: {'loss': 0.2934, 'grad_norm': 4.6428327560424805, 'learning_rate': 1.062364653936233e-05}[Rank 3] Trainer log: {'loss': 0.2934, 'grad_norm': 4.6428327560424805, 'learning_rate': 1.062364653936233e-05} - - -[Rank 2] Trainer log: {'loss': 0.2934, 'grad_norm': 4.6428327560424805, 'learning_rate': 1.062364653936233e-05} -{'loss': 0.2934, 'grad_norm': 4.6428327560424805, 'learning_rate': 1.062364653936233e-05, 'epoch': 0.51} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1438, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025929417461156845, 'train/lm_loss': 6.786625599488617e-05, 'train/info_loss': 3.22450723615475e-05, 'train/ref_loss': 0.1874995231628418, 'train/uncertainty_loss': -6.87067920807749e-05, 'train/video_loss': 0.1895374208688736, 'train/total_loss': 0.18960528075695038} -tensor(0.0615, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18938122987747194, 'train/info_loss': 0.14546383917331696, 'train/ref_loss': None, 'train/uncertainty_loss': -9.393339278176428e-05, 'train/video_loss': 0.14536990225315094, 'train/total_loss': 0.3347511291503906} -tensor(0.1848, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3733, 'grad_norm': 7.544466495513916, 'learning_rate': 1.0612999404449722e-05}[Rank 3] Trainer log: {'loss': 0.3733, 'grad_norm': 7.544466495513916, 'learning_rate': 1.0612999404449722e-05}[Rank 1] Trainer log: {'loss': 0.3733, 'grad_norm': 7.544466495513916, 'learning_rate': 1.0612999404449722e-05} - - -[Rank 2] Trainer log: {'loss': 0.3733, 'grad_norm': 7.544466495513916, 'learning_rate': 1.0612999404449722e-05} -{'loss': 0.3733, 'grad_norm': 7.544466495513916, 'learning_rate': 1.0612999404449722e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1600, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001729297102428973, 'train/lm_loss': 6.898643914610147e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.06412827223539352, 'train/uncertainty_loss': -6.775682559236885e-05, 'train/video_loss': 0.06547524780035019, 'train/total_loss': 0.0655442327260971} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0998, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021015685051679612, 'train/lm_loss': 4.7391999396495526e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.11274924874305725, 'train/uncertainty_loss': -6.777975359000266e-05, 'train/video_loss': 0.11439187079668045, 'train/total_loss': 0.11443926393985748} -[Rank 1] Trainer log: {'loss': 0.3038, 'grad_norm': 7.4383463859558105, 'learning_rate': 1.0602351571964921e-05}[Rank 3] Trainer log: {'loss': 0.3038, 'grad_norm': 7.4383463859558105, 'learning_rate': 1.0602351571964921e-05}[Rank 0] Trainer log: {'loss': 0.3038, 'grad_norm': 7.4383463859558105, 'learning_rate': 1.0602351571964921e-05} - -[Rank 2] Trainer log: {'loss': 0.3038, 'grad_norm': 7.4383463859558105, 'learning_rate': 1.0602351571964921e-05} - -{'loss': 0.3038, 'grad_norm': 7.4383463859558105, 'learning_rate': 1.0602351571964921e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1341, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020530503243207932, 'train/lm_loss': 6.0596823459491134e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.29985225200653076, 'train/uncertainty_loss': 0.013409781455993653, 'train/video_loss': 0.31493476033210754, 'train/total_loss': 0.31499534845352173} -tensor(0.2709, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18366819620132446, 'train/info_loss': 0.2870149314403534, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010680523701012135, 'train/video_loss': 0.2869081199169159, 'train/total_loss': 0.47057631611824036} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0388, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3898, 'grad_norm': 4.736631393432617, 'learning_rate': 1.0591703054024795e-05}[Rank 0] Trainer log: {'loss': 0.3898, 'grad_norm': 4.736631393432617, 'learning_rate': 1.0591703054024795e-05}[Rank 3] Trainer log: {'loss': 0.3898, 'grad_norm': 4.736631393432617, 'learning_rate': 1.0591703054024795e-05} - -[Rank 2] Trainer log: {'loss': 0.3898, 'grad_norm': 4.736631393432617, 'learning_rate': 1.0591703054024795e-05} - -{'loss': 0.3898, 'grad_norm': 4.736631393432617, 'learning_rate': 1.0591703054024795e-05, 'epoch': 0.51} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3159811496734619, 'train/info_loss': 0.1353665441274643, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012419237755239011, 'train/video_loss': 0.13524235785007477, 'train/total_loss': 0.4512234926223755} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2607, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023022328969091177, 'train/lm_loss': 6.855743122287094e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.3934139609336853, 'train/uncertainty_loss': 0.026073139905929566, 'train/video_loss': 0.42136141657829285, 'train/total_loss': 0.42142996191978455} -tensor(0.3269, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3367, 'grad_norm': 6.254545211791992, 'learning_rate': 1.0581053862746993e-05}[Rank 0] Trainer log: {'loss': 0.3367, 'grad_norm': 6.254545211791992, 'learning_rate': 1.0581053862746993e-05}[Rank 2] Trainer log: {'loss': 0.3367, 'grad_norm': 6.254545211791992, 'learning_rate': 1.0581053862746993e-05} - - -[Rank 3] Trainer log: {'loss': 0.3367, 'grad_norm': 6.254545211791992, 'learning_rate': 1.0581053862746993e-05} -{'loss': 0.3367, 'grad_norm': 6.254545211791992, 'learning_rate': 1.0581053862746993e-05, 'epoch': 0.51} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13213049173355104, 'train/info_loss': 0.09572212398052216, 'train/ref_loss': None, 'train/uncertainty_loss': -8.904431597329677e-05, 'train/video_loss': 0.09563308209180832, 'train/total_loss': 0.2277635633945465} -tensor(0.0892, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005059799179434777, 'train/lm_loss': 7.923463708721102e-05, 'train/info_loss': 3.1529863917967305e-05, 'train/ref_loss': 0.14780297875404358, 'train/uncertainty_loss': -8.170006913132966e-05, 'train/video_loss': 0.15180064737796783, 'train/total_loss': 0.15187987685203552} -tensor(0.0237, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3394, 'grad_norm': 6.155816078186035, 'learning_rate': 1.0570404010249926e-05} -[Rank 0] Trainer log: {'loss': 0.3394, 'grad_norm': 6.155816078186035, 'learning_rate': 1.0570404010249926e-05}[Rank 3] Trainer log: {'loss': 0.3394, 'grad_norm': 6.155816078186035, 'learning_rate': 1.0570404010249926e-05} - -[Rank 2] Trainer log: {'loss': 0.3394, 'grad_norm': 6.155816078186035, 'learning_rate': 1.0570404010249926e-05} -{'loss': 0.3394, 'grad_norm': 6.155816078186035, 'learning_rate': 1.0570404010249926e-05, 'epoch': 0.51} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08813554644584656, 'train/info_loss': 0.1106153056025505, 'train/ref_loss': None, 'train/uncertainty_loss': -9.729012963362039e-05, 'train/video_loss': 0.11051801592111588, 'train/total_loss': 0.19865356385707855} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1800, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3503, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0999654233455658, 'train/info_loss': 0.24381723999977112, 'train/ref_loss': None, 'train/uncertainty_loss': -9.819816332310439e-05, 'train/video_loss': 0.24371904134750366, 'train/total_loss': 0.34368446469306946} -[Rank 0] Trainer log: {'loss': 0.3269, 'grad_norm': 2.015839099884033, 'learning_rate': 1.055975350865276e-05}[Rank 1] Trainer log: {'loss': 0.3269, 'grad_norm': 2.015839099884033, 'learning_rate': 1.055975350865276e-05} -[Rank 3] Trainer log: {'loss': 0.3269, 'grad_norm': 2.015839099884033, 'learning_rate': 1.055975350865276e-05} - -[Rank 2] Trainer log: {'loss': 0.3269, 'grad_norm': 2.015839099884033, 'learning_rate': 1.055975350865276e-05} -{'loss': 0.3269, 'grad_norm': 2.015839099884033, 'learning_rate': 1.055975350865276e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1174, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2277, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022002814803272487, 'train/lm_loss': 6.143103237263859e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.3672938346862793, 'train/uncertainty_loss': 0.02276621609926224, 'train/video_loss': 0.3918505311012268, 'train/total_loss': 0.39191195368766785} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17502641677856445, 'train/info_loss': 0.1457546055316925, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010504997335374356, 'train/video_loss': 0.14564955234527588, 'train/total_loss': 0.32067596912384033} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2339, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3182, 'grad_norm': 3.2264773845672607, 'learning_rate': 1.0549102370075397e-05} -[Rank 0] Trainer log: {'loss': 0.3182, 'grad_norm': 3.2264773845672607, 'learning_rate': 1.0549102370075397e-05}[Rank 2] Trainer log: {'loss': 0.3182, 'grad_norm': 3.2264773845672607, 'learning_rate': 1.0549102370075397e-05} - -[Rank 3] Trainer log: {'loss': 0.3182, 'grad_norm': 3.2264773845672607, 'learning_rate': 1.0549102370075397e-05} -{'loss': 0.3182, 'grad_norm': 3.2264773845672607, 'learning_rate': 1.0549102370075397e-05, 'epoch': 0.51} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3014, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016233463538810612, 'train/lm_loss': 3.611715219449252e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.14280346035957336, 'train/uncertainty_loss': -6.612674915231765e-05, 'train/video_loss': 0.1440594345331192, 'train/total_loss': 0.1440955549478531} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(1.1887, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1488, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002765166573226452, 'train/lm_loss': 4.190959443803877e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.30351293087005615, 'train/uncertainty_loss': 0.014879995584487916, 'train/video_loss': 0.32062962651252747, 'train/total_loss': 0.32067152857780457} -tensor(0.1509, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3988, 'grad_norm': 12.278534889221191, 'learning_rate': 1.0538450606638464e-05}[Rank 2] Trainer log: {'loss': 0.3988, 'grad_norm': 12.278534889221191, 'learning_rate': 1.0538450606638464e-05}[Rank 0] Trainer log: {'loss': 0.3988, 'grad_norm': 12.278534889221191, 'learning_rate': 1.0538450606638464e-05} - - -[Rank 3] Trainer log: {'loss': 0.3988, 'grad_norm': 12.278534889221191, 'learning_rate': 1.0538450606638464e-05} -{'loss': 0.3988, 'grad_norm': 12.278534889221191, 'learning_rate': 1.0538450606638464e-05, 'epoch': 0.51} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.1169, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020279805175960064, 'train/lm_loss': 5.2945758216083054e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.14947715401649475, 'train/uncertainty_loss': -6.793097709305585e-05, 'train/video_loss': 0.15106098353862762, 'train/total_loss': 0.15111392736434937} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0823, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1471, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002782676601782441, 'train/lm_loss': 4.734432441182435e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.306509405374527, 'train/uncertainty_loss': 0.014712046086788179, 'train/video_loss': 0.3234724998474121, 'train/total_loss': 0.32351985573768616} -[Rank 3] Trainer log: {'loss': 0.3289, 'grad_norm': 9.260180473327637, 'learning_rate': 1.0527798230463305e-05}[Rank 1] Trainer log: {'loss': 0.3289, 'grad_norm': 9.260180473327637, 'learning_rate': 1.0527798230463305e-05}[Rank 0] Trainer log: {'loss': 0.3289, 'grad_norm': 9.260180473327637, 'learning_rate': 1.0527798230463305e-05} - - -[Rank 2] Trainer log: {'loss': 0.3289, 'grad_norm': 9.260180473327637, 'learning_rate': 1.0527798230463305e-05} -{'loss': 0.3289, 'grad_norm': 9.260180473327637, 'learning_rate': 1.0527798230463305e-05, 'epoch': 0.51} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27175531387329105, 'train/info_loss': 0.2838438153266907, 'train/ref_loss': None, 'train/uncertainty_loss': -8.704981883056462e-05, 'train/video_loss': 0.2837567627429962, 'train/total_loss': 0.5555120706558228} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4224425315856934, 'train/info_loss': 0.19566011428833008, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001234679133631289, 'train/video_loss': 0.19553664326667786, 'train/total_loss': 0.6179791688919067} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0169, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4514, 'grad_norm': 3.0699563026428223, 'learning_rate': 1.051714525367195e-05}[Rank 1] Trainer log: {'loss': 0.4514, 'grad_norm': 3.0699563026428223, 'learning_rate': 1.051714525367195e-05}[Rank 3] Trainer log: {'loss': 0.4514, 'grad_norm': 3.0699563026428223, 'learning_rate': 1.051714525367195e-05} - - -[Rank 2] Trainer log: {'loss': 0.4514, 'grad_norm': 3.0699563026428223, 'learning_rate': 1.051714525367195e-05} -{'loss': 0.4514, 'grad_norm': 3.0699563026428223, 'learning_rate': 1.051714525367195e-05, 'epoch': 0.51} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13083263635635375, 'train/info_loss': 0.2193959802389145, 'train/ref_loss': None, 'train/uncertainty_loss': -9.078016155399382e-05, 'train/video_loss': 0.21930520236492157, 'train/total_loss': 0.3501378297805786} -tensor(0.5346, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2555, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28560643196105956, 'train/info_loss': 0.13717250525951385, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011686763027682901, 'train/video_loss': 0.1370556354522705, 'train/total_loss': 0.42266207933425903} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4376, 'grad_norm': 7.7380781173706055, 'learning_rate': 1.0506491688387128e-05}[Rank 3] Trainer log: {'loss': 0.4376, 'grad_norm': 7.7380781173706055, 'learning_rate': 1.0506491688387128e-05} - -[Rank 0] Trainer log: {'loss': 0.4376, 'grad_norm': 7.7380781173706055, 'learning_rate': 1.0506491688387128e-05} -[Rank 2] Trainer log: {'loss': 0.4376, 'grad_norm': 7.7380781173706055, 'learning_rate': 1.0506491688387128e-05} -{'loss': 0.4376, 'grad_norm': 7.7380781173706055, 'learning_rate': 1.0506491688387128e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0202, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1759, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028339701239019636, 'train/lm_loss': 3.208856505807489e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.3189260959625244, 'train/uncertainty_loss': 0.017588454484939575, 'train/video_loss': 0.33880481123924255, 'train/total_loss': 0.3388369083404541} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3976054906845093, 'train/info_loss': 0.24688753485679626, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012476572301238777, 'train/video_loss': 0.24676276743412018, 'train/total_loss': 0.6443682909011841} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.5395, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4777, 'grad_norm': 8.091878890991211, 'learning_rate': 1.0495837546732224e-05}[Rank 1] Trainer log: {'loss': 0.4777, 'grad_norm': 8.091878890991211, 'learning_rate': 1.0495837546732224e-05} - -[Rank 2] Trainer log: {'loss': 0.4777, 'grad_norm': 8.091878890991211, 'learning_rate': 1.0495837546732224e-05} -[Rank 3] Trainer log: {'loss': 0.4777, 'grad_norm': 8.091878890991211, 'learning_rate': 1.0495837546732224e-05} -{'loss': 0.4777, 'grad_norm': 8.091878890991211, 'learning_rate': 1.0495837546732224e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32229793071746826, 'train/info_loss': 0.17403475940227509, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011959829134866595, 'train/video_loss': 0.17391516268253326, 'train/total_loss': 0.49621307849884033} -tensor(0.1204, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.6742, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2787, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0986209511756897, 'train/info_loss': 0.13583016395568848, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011583558516576887, 'train/video_loss': 0.1357143223285675, 'train/total_loss': 0.2343352735042572} -[Rank 1] Trainer log: {'loss': 0.4679, 'grad_norm': 14.635433197021484, 'learning_rate': 1.0485182840831285e-05}[Rank 3] Trainer log: {'loss': 0.4679, 'grad_norm': 14.635433197021484, 'learning_rate': 1.0485182840831285e-05} - -[Rank 0] Trainer log: {'loss': 0.4679, 'grad_norm': 14.635433197021484, 'learning_rate': 1.0485182840831285e-05}[Rank 2] Trainer log: {'loss': 0.4679, 'grad_norm': 14.635433197021484, 'learning_rate': 1.0485182840831285e-05} - -{'loss': 0.4679, 'grad_norm': 14.635433197021484, 'learning_rate': 1.0485182840831285e-05, 'epoch': 0.51} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2842, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0500, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00044266395270824437, 'train/lm_loss': 4.126599815208465e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.23946812748908997, 'train/uncertainty_loss': 0.005004590749740601, 'train/video_loss': 0.2480381578207016, 'train/total_loss': 0.24807941913604736} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=)tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09908754229545594, 'train/info_loss': 0.17713521420955658, 'train/ref_loss': None, 'train/uncertainty_loss': -8.721337653696538e-05, 'train/video_loss': 0.177047997713089, 'train/total_loss': 0.27613553404808044} -tensor(0.0578, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3874, 'grad_norm': 14.041946411132812, 'learning_rate': 1.0474527582808998e-05}[Rank 1] Trainer log: {'loss': 0.3874, 'grad_norm': 14.041946411132812, 'learning_rate': 1.0474527582808998e-05}[Rank 2] Trainer log: {'loss': 0.3874, 'grad_norm': 14.041946411132812, 'learning_rate': 1.0474527582808998e-05} - - -[Rank 0] Trainer log: {'loss': 0.3874, 'grad_norm': 14.041946411132812, 'learning_rate': 1.0474527582808998e-05} -{'loss': 0.3874, 'grad_norm': 14.041946411132812, 'learning_rate': 1.0474527582808998e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3026, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001764589222148061, 'train/lm_loss': 6.081133615225554e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.11392664909362793, 'train/uncertainty_loss': -6.614992744289339e-05, 'train/video_loss': 0.1153000071644783, 'train/total_loss': 0.1153608188033104} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34330570697784424, 'train/info_loss': 0.18736833333969116, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012092859251424671, 'train/video_loss': 0.1872474104166031, 'train/total_loss': 0.5305531024932861} -tensor(0.1050, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0331, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0860, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3258, 'grad_norm': 12.443925857543945, 'learning_rate': 1.0463871784790682e-05}[Rank 3] Trainer log: {'loss': 0.3258, 'grad_norm': 12.443925857543945, 'learning_rate': 1.0463871784790682e-05} - -[Rank 0] Trainer log: {'loss': 0.3258, 'grad_norm': 12.443925857543945, 'learning_rate': 1.0463871784790682e-05}[Rank 1] Trainer log: {'loss': 0.3258, 'grad_norm': 12.443925857543945, 'learning_rate': 1.0463871784790682e-05} - -{'loss': 0.3258, 'grad_norm': 12.443925857543945, 'learning_rate': 1.0463871784790682e-05, 'epoch': 0.51} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2895621061325073, 'train/info_loss': 0.18204434216022491, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013956805923953652, 'train/video_loss': 0.18190477788448334, 'train/total_loss': 0.47146689891815186} -tensor(0.0489, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0973, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0905, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1746897578239441, 'train/info_loss': 0.21471673250198364, 'train/ref_loss': None, 'train/uncertainty_loss': -8.495832444168628e-05, 'train/video_loss': 0.21463178098201752, 'train/total_loss': 0.38932153582572937} -tensor(0.1674, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3268, 'grad_norm': 11.620532035827637, 'learning_rate': 1.0453215458902264e-05}[Rank 0] Trainer log: {'loss': 0.3268, 'grad_norm': 11.620532035827637, 'learning_rate': 1.0453215458902264e-05}[Rank 1] Trainer log: {'loss': 0.3268, 'grad_norm': 11.620532035827637, 'learning_rate': 1.0453215458902264e-05} - - -[Rank 2] Trainer log: {'loss': 0.3268, 'grad_norm': 11.620532035827637, 'learning_rate': 1.0453215458902264e-05} -{'loss': 0.3268, 'grad_norm': 11.620532035827637, 'learning_rate': 1.0453215458902264e-05, 'epoch': 0.51} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0356, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035975826904177666, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.22160044312477112, 'train/uncertainty_loss': 0.003559175506234169, 'train/video_loss': 0.228059321641922, 'train/total_loss': 0.22809529304504395} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3824272155761719, 'train/info_loss': 0.21513670682907104, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014581356663256884, 'train/video_loss': 0.21499089896678925, 'train/total_loss': 0.5974181294441223} -tensor(0.2708, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3971, 'grad_norm': 6.176470756530762, 'learning_rate': 1.0442558617270277e-05}[Rank 0] Trainer log: {'loss': 0.3971, 'grad_norm': 6.176470756530762, 'learning_rate': 1.0442558617270277e-05} -[Rank 1] Trainer log: {'loss': 0.3971, 'grad_norm': 6.176470756530762, 'learning_rate': 1.0442558617270277e-05} -[Rank 3] Trainer log: {'loss': 0.3971, 'grad_norm': 6.176470756530762, 'learning_rate': 1.0442558617270277e-05} - -{'loss': 0.3971, 'grad_norm': 6.176470756530762, 'learning_rate': 1.0442558617270277e-05, 'epoch': 0.51} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2826858043670654, 'train/info_loss': 0.23583543300628662, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011095417430624367, 'train/video_loss': 0.23572447896003723, 'train/total_loss': 0.518410325050354} -tensor(0.7398, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2369, device='cuda:1', grad_fn=) tensor(0.0545, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:1', grad_fn=) - tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002771907253190875, 'train/lm_loss': 3.6331691080704335e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.24263127148151398, 'train/uncertainty_loss': 0.005450256541371346, 'train/video_loss': 0.2503214180469513, 'train/total_loss': 0.2503577470779419} -[Rank 3] Trainer log: {'loss': 0.4402, 'grad_norm': 13.716863632202148, 'learning_rate': 1.0431901272021841e-05} -[Rank 1] Trainer log: {'loss': 0.4402, 'grad_norm': 13.716863632202148, 'learning_rate': 1.0431901272021841e-05} -[Rank 2] Trainer log: {'loss': 0.4402, 'grad_norm': 13.716863632202148, 'learning_rate': 1.0431901272021841e-05} -[Rank 0] Trainer log: {'loss': 0.4402, 'grad_norm': 13.716863632202148, 'learning_rate': 1.0431901272021841e-05} -{'loss': 0.4402, 'grad_norm': 13.716863632202148, 'learning_rate': 1.0431901272021841e-05, 'epoch': 0.51} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29662461280822755, 'train/info_loss': 0.1615721732378006, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011723436182364821, 'train/video_loss': 0.1614549458026886, 'train/total_loss': 0.45807957649230957} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3365468263626099, 'train/info_loss': 0.24694809317588806, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001428905874490738, 'train/video_loss': 0.24680520594120026, 'train/total_loss': 0.5833520293235779} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1832, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4489, 'grad_norm': 4.451913356781006, 'learning_rate': 1.0421243435284647e-05} -[Rank 2] Trainer log: {'loss': 0.4489, 'grad_norm': 4.451913356781006, 'learning_rate': 1.0421243435284647e-05} -[Rank 0] Trainer log: {'loss': 0.4489, 'grad_norm': 4.451913356781006, 'learning_rate': 1.0421243435284647e-05}[Rank 1] Trainer log: {'loss': 0.4489, 'grad_norm': 4.451913356781006, 'learning_rate': 1.0421243435284647e-05} - -{'loss': 0.4489, 'grad_norm': 4.451913356781006, 'learning_rate': 1.0421243435284647e-05, 'epoch': 0.51} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38303322792053224, 'train/info_loss': 0.2824927866458893, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014588312478736043, 'train/video_loss': 0.2823469042778015, 'train/total_loss': 0.6653801202774048} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1329, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1755, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15766950845718386, 'train/info_loss': 0.12816017866134644, 'train/ref_loss': None, 'train/uncertainty_loss': -9.776795050129295e-05, 'train/video_loss': 0.1280624121427536, 'train/total_loss': 0.2857319116592407} -[Rank 1] Trainer log: {'loss': 0.4301, 'grad_norm': 3.6090846061706543, 'learning_rate': 1.0410585119186945e-05}[Rank 3] Trainer log: {'loss': 0.4301, 'grad_norm': 3.6090846061706543, 'learning_rate': 1.0410585119186945e-05}[Rank 2] Trainer log: {'loss': 0.4301, 'grad_norm': 3.6090846061706543, 'learning_rate': 1.0410585119186945e-05} - - -[Rank 0] Trainer log: {'loss': 0.4301, 'grad_norm': 3.6090846061706543, 'learning_rate': 1.0410585119186945e-05} -{'loss': 0.4301, 'grad_norm': 3.6090846061706543, 'learning_rate': 1.0410585119186945e-05, 'epoch': 0.51} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1271, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000157795415725559, 'train/lm_loss': 4.1170651093125345e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.2968056797981262, 'train/uncertainty_loss': 0.012708184123039246, 'train/video_loss': 0.31080079078674316, 'train/total_loss': 0.31084194779396057} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0722, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023345670197159052, 'train/lm_loss': 4.164738929830492e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.25436946749687195, 'train/uncertainty_loss': 0.007220292091369629, 'train/video_loss': 0.2634831666946411, 'train/total_loss': 0.2635248005390167} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3774, 'grad_norm': 3.633248805999756, 'learning_rate': 1.0399926335857532e-05}[Rank 2] Trainer log: {'loss': 0.3774, 'grad_norm': 3.633248805999756, 'learning_rate': 1.0399926335857532e-05} - -[Rank 0] Trainer log: {'loss': 0.3774, 'grad_norm': 3.633248805999756, 'learning_rate': 1.0399926335857532e-05}[Rank 3] Trainer log: {'loss': 0.3774, 'grad_norm': 3.633248805999756, 'learning_rate': 1.0399926335857532e-05} - -{'loss': 0.3774, 'grad_norm': 3.633248805999756, 'learning_rate': 1.0399926335857532e-05, 'epoch': 0.51} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2838, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023112455382943154, 'train/lm_loss': 5.33986312802881e-05, 'train/info_loss': 2.7596188374445774e-05, 'train/ref_loss': 0.12037868052721024, 'train/uncertainty_loss': -6.840383866801858e-05, 'train/video_loss': 0.12218686938285828, 'train/total_loss': 0.12224026769399643} -tensor(0.0981, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0989, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001843122416175902, 'train/lm_loss': 3.175483434461057e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.16074791550636292, 'train/uncertainty_loss': -6.766971200704575e-05, 'train/video_loss': 0.16217710077762604, 'train/total_loss': 0.16220885515213013} -tensor(0.0730, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2917, 'grad_norm': 11.18959903717041, 'learning_rate': 1.0389267097425733e-05} -[Rank 2] Trainer log: {'loss': 0.2917, 'grad_norm': 11.18959903717041, 'learning_rate': 1.0389267097425733e-05}[Rank 0] Trainer log: {'loss': 0.2917, 'grad_norm': 11.18959903717041, 'learning_rate': 1.0389267097425733e-05} -[Rank 1] Trainer log: {'loss': 0.2917, 'grad_norm': 11.18959903717041, 'learning_rate': 1.0389267097425733e-05} - -{'loss': 0.2917, 'grad_norm': 11.18959903717041, 'learning_rate': 1.0389267097425733e-05, 'epoch': 0.51} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2125553369522095, 'train/info_loss': 0.21184173226356506, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012255188776180148, 'train/video_loss': 0.21171918511390686, 'train/total_loss': 0.4242745041847229} -tensor(0.4140, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1416783094406128, 'train/info_loss': 0.14932295680046082, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010250197956338525, 'train/video_loss': 0.14922045171260834, 'train/total_loss': 0.29089877009391785} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4464, 'grad_norm': 7.742002964019775, 'learning_rate': 1.0378607416021398e-05} -[Rank 0] Trainer log: {'loss': 0.4464, 'grad_norm': 7.742002964019775, 'learning_rate': 1.0378607416021398e-05}[Rank 2] Trainer log: {'loss': 0.4464, 'grad_norm': 7.742002964019775, 'learning_rate': 1.0378607416021398e-05} -[Rank 1] Trainer log: {'loss': 0.4464, 'grad_norm': 7.742002964019775, 'learning_rate': 1.0378607416021398e-05} - -{'loss': 0.4464, 'grad_norm': 7.742002964019775, 'learning_rate': 1.0378607416021398e-05, 'epoch': 0.51} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1737, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003009439678862691, 'train/lm_loss': 4.086077096872032e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.32468920946121216, 'train/uncertainty_loss': 0.01736569106578827, 'train/video_loss': 0.3444862365722656, 'train/total_loss': 0.34452709555625916} -tensor(0.0271, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4272979259490967, 'train/info_loss': 0.17849375307559967, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010482320794835687, 'train/video_loss': 0.17838892340660095, 'train/total_loss': 0.6056868433952332} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.397, 'grad_norm': 5.2704997062683105, 'learning_rate': 1.0367947303774875e-05}[Rank 3] Trainer log: {'loss': 0.397, 'grad_norm': 5.2704997062683105, 'learning_rate': 1.0367947303774875e-05} - -[Rank 1] Trainer log: {'loss': 0.397, 'grad_norm': 5.2704997062683105, 'learning_rate': 1.0367947303774875e-05} -[Rank 0] Trainer log: {'loss': 0.397, 'grad_norm': 5.2704997062683105, 'learning_rate': 1.0367947303774875e-05} -{'loss': 0.397, 'grad_norm': 5.2704997062683105, 'learning_rate': 1.0367947303774875e-05, 'epoch': 0.51} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1572152614593506, 'train/info_loss': 0.27374130487442017, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001258104806765914, 'train/video_loss': 0.2736155092716217, 'train/total_loss': 0.43083077669143677} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.018188734352588654, 'train/info_loss': 0.10504136979579926, 'train/ref_loss': None, 'train/uncertainty_loss': -9.902403689920902e-05, 'train/video_loss': 0.10494234412908554, 'train/total_loss': 0.12313108146190643} -tensor(0.0641, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4061, 'grad_norm': 3.998946189880371, 'learning_rate': 1.0357286772817003e-05} -[Rank 3] Trainer log: {'loss': 0.4061, 'grad_norm': 3.998946189880371, 'learning_rate': 1.0357286772817003e-05} -[Rank 1] Trainer log: {'loss': 0.4061, 'grad_norm': 3.998946189880371, 'learning_rate': 1.0357286772817003e-05} -[Rank 0] Trainer log: {'loss': 0.4061, 'grad_norm': 3.998946189880371, 'learning_rate': 1.0357286772817003e-05} -{'loss': 0.4061, 'grad_norm': 3.998946189880371, 'learning_rate': 1.0357286772817003e-05, 'epoch': 0.51} -tensor(0.4830, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.7218, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0907, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028634592890739444, 'train/lm_loss': 6.848592893220485e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.2799784541130066, 'train/uncertainty_loss': 0.009071177989244462, 'train/video_loss': 0.2913714349269867, 'train/total_loss': 0.2914399206638336} -tensor(0.0343, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2529787063598633, 'train/info_loss': 0.21261262893676758, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012918951688334346, 'train/video_loss': 0.21248343586921692, 'train/total_loss': 0.4654621481895447} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1000, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2880, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4286, 'grad_norm': 17.471097946166992, 'learning_rate': 1.0346625835279103e-05}[Rank 0] Trainer log: {'loss': 0.4286, 'grad_norm': 17.471097946166992, 'learning_rate': 1.0346625835279103e-05}[Rank 1] Trainer log: {'loss': 0.4286, 'grad_norm': 17.471097946166992, 'learning_rate': 1.0346625835279103e-05} - -[Rank 2] Trainer log: {'loss': 0.4286, 'grad_norm': 17.471097946166992, 'learning_rate': 1.0346625835279103e-05} - -{'loss': 0.4286, 'grad_norm': 17.471097946166992, 'learning_rate': 1.0346625835279103e-05, 'epoch': 0.51} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2475742816925049, 'train/info_loss': 0.1602632850408554, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000127545150462538, 'train/video_loss': 0.16013574600219727, 'train/total_loss': 0.4077100157737732} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3456, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002693298971280456, 'train/lm_loss': 7.034494774416089e-05, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.17360059916973114, 'train/uncertainty_loss': -6.94846734404564e-05, 'train/video_loss': 0.17571602761745453, 'train/total_loss': 0.17578637599945068} -tensor(0.0010, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.316, 'grad_norm': 4.187443256378174, 'learning_rate': 1.0335964503292951e-05}[Rank 1] Trainer log: {'loss': 0.316, 'grad_norm': 4.187443256378174, 'learning_rate': 1.0335964503292951e-05} - -[Rank 2] Trainer log: {'loss': 0.316, 'grad_norm': 4.187443256378174, 'learning_rate': 1.0335964503292951e-05} -[Rank 0] Trainer log: {'loss': 0.316, 'grad_norm': 4.187443256378174, 'learning_rate': 1.0335964503292951e-05} -{'loss': 0.316, 'grad_norm': 4.187443256378174, 'learning_rate': 1.0335964503292951e-05, 'epoch': 0.51} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.7908, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033379881642758846, 'train/lm_loss': 6.06921617873013e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.7260329723358154, 'train/uncertainty_loss': 0.07907509207725526, 'train/video_loss': 0.8078063130378723, 'train/total_loss': 0.8078669905662537} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2677, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1077, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00043093664571642877, 'train/lm_loss': 0.00015834260266274215, 'train/info_loss': 3.653631210909225e-05, 'train/ref_loss': 0.26396217942237854, 'train/uncertainty_loss': 0.010766754299402237, 'train/video_loss': 0.2782129645347595, 'train/total_loss': 0.27837130427360535} -[Rank 1] Trainer log: {'loss': 0.3994, 'grad_norm': 20.359241485595703, 'learning_rate': 1.0325302788990775e-05}[Rank 3] Trainer log: {'loss': 0.3994, 'grad_norm': 20.359241485595703, 'learning_rate': 1.0325302788990775e-05}[Rank 2] Trainer log: {'loss': 0.3994, 'grad_norm': 20.359241485595703, 'learning_rate': 1.0325302788990775e-05} - - -[Rank 0] Trainer log: {'loss': 0.3994, 'grad_norm': 20.359241485595703, 'learning_rate': 1.0325302788990775e-05} -{'loss': 0.3994, 'grad_norm': 20.359241485595703, 'learning_rate': 1.0325302788990775e-05, 'epoch': 0.52} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4071, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017968916799873116, 'train/lm_loss': 5.358931375667453e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.4826279878616333, 'train/uncertainty_loss': 0.04070677757263184, 'train/video_loss': 0.5247992277145386, 'train/total_loss': 0.5248528122901917} -tensor(0.0633, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.046105083823204045, 'train/info_loss': 0.13879777491092682, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010826850775629282, 'train/video_loss': 0.13868950307369232, 'train/total_loss': 0.1847945898771286} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3101, 'grad_norm': 4.123138427734375, 'learning_rate': 1.031464070450524e-05} -[Rank 0] Trainer log: {'loss': 0.3101, 'grad_norm': 4.123138427734375, 'learning_rate': 1.031464070450524e-05}[Rank 1] Trainer log: {'loss': 0.3101, 'grad_norm': 4.123138427734375, 'learning_rate': 1.031464070450524e-05} -[Rank 3] Trainer log: {'loss': 0.3101, 'grad_norm': 4.123138427734375, 'learning_rate': 1.031464070450524e-05} - -{'loss': 0.3101, 'grad_norm': 4.123138427734375, 'learning_rate': 1.031464070450524e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0294, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002797177061438561, 'train/lm_loss': 7.961595547385515e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.24180279672145844, 'train/uncertainty_loss': 0.002936580218374729, 'train/video_loss': 0.24700966477394104, 'train/total_loss': 0.24708928167819977} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.2180, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1307, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2577, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025388638023287057, 'train/lm_loss': 4.7034455928951505e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.3862142562866211, 'train/uncertainty_loss': 0.025768432021141055, 'train/video_loss': 0.41403868794441223, 'train/total_loss': 0.41408571600914} -[Rank 3] Trainer log: {'loss': 0.3546, 'grad_norm': 10.665351867675781, 'learning_rate': 1.030397826196943e-05}[Rank 1] Trainer log: {'loss': 0.3546, 'grad_norm': 10.665351867675781, 'learning_rate': 1.030397826196943e-05} - -[Rank 2] Trainer log: {'loss': 0.3546, 'grad_norm': 10.665351867675781, 'learning_rate': 1.030397826196943e-05}[Rank 0] Trainer log: {'loss': 0.3546, 'grad_norm': 10.665351867675781, 'learning_rate': 1.030397826196943e-05} - -{'loss': 0.3546, 'grad_norm': 10.665351867675781, 'learning_rate': 1.030397826196943e-05, 'epoch': 0.52} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1205, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0565, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000515178544446826, 'train/lm_loss': 4.746350750792772e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.2494993507862091, 'train/uncertainty_loss': 0.0056541085243225105, 'train/video_loss': 0.25929901003837585, 'train/total_loss': 0.25934648513793945} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=)tensor(0.0260, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:1', grad_fn=) - -{'train/tv_loss': 0.00025767157785594463, 'train/lm_loss': 0.00011204917682334782, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.1599169373512268, 'train/uncertainty_loss': -6.935898563824594e-05, 'train/video_loss': 0.16193969547748566, 'train/total_loss': 0.1620517373085022} -tensor(0.2484, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2761, 'grad_norm': 1.3950586318969727, 'learning_rate': 1.0293315473516833e-05} -[Rank 1] Trainer log: {'loss': 0.2761, 'grad_norm': 1.3950586318969727, 'learning_rate': 1.0293315473516833e-05} -[Rank 0] Trainer log: {'loss': 0.2761, 'grad_norm': 1.3950586318969727, 'learning_rate': 1.0293315473516833e-05}[Rank 2] Trainer log: {'loss': 0.2761, 'grad_norm': 1.3950586318969727, 'learning_rate': 1.0293315473516833e-05} - -{'loss': 0.2761, 'grad_norm': 1.3950586318969727, 'learning_rate': 1.0293315473516833e-05, 'epoch': 0.52} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2009, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5548, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033286819234490395, 'train/lm_loss': 5.327945691533387e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.174585223197937, 'train/uncertainty_loss': -7.02591030858457e-05, 'train/video_loss': 0.17720526456832886, 'train/total_loss': 0.17725855112075806} -tensor(-0.0005, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.0003363270778208971, 'train/lm_loss': 2.8059899341315032e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.18272897601127625, 'train/uncertainty_loss': -6.865624454803765e-05, 'train/video_loss': 0.18537192046642303, 'train/total_loss': 0.18539997935295105} -tensor(0.2722, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3008, 'grad_norm': 18.125511169433594, 'learning_rate': 1.0282652351281344e-05}[Rank 2] Trainer log: {'loss': 0.3008, 'grad_norm': 18.125511169433594, 'learning_rate': 1.0282652351281344e-05}[Rank 0] Trainer log: {'loss': 0.3008, 'grad_norm': 18.125511169433594, 'learning_rate': 1.0282652351281344e-05} - -[Rank 1] Trainer log: {'loss': 0.3008, 'grad_norm': 18.125511169433594, 'learning_rate': 1.0282652351281344e-05} - -{'loss': 0.3008, 'grad_norm': 18.125511169433594, 'learning_rate': 1.0282652351281344e-05, 'epoch': 0.52} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0602, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2770, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021091497037559748, 'train/lm_loss': 6.977294688113034e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.4041328430175781, 'train/uncertainty_loss': 0.0276995450258255, 'train/video_loss': 0.43354982137680054, 'train/total_loss': 0.43361958861351013} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10649865865707397, 'train/info_loss': 0.11561564356088638, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011081229895353318, 'train/video_loss': 0.11550483107566833, 'train/total_loss': 0.2220034897327423} -tensor(0.6426, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1335, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3452, 'grad_norm': 13.088363647460938, 'learning_rate': 1.027198890739722e-05}[Rank 0] Trainer log: {'loss': 0.3452, 'grad_norm': 13.088363647460938, 'learning_rate': 1.027198890739722e-05}[Rank 3] Trainer log: {'loss': 0.3452, 'grad_norm': 13.088363647460938, 'learning_rate': 1.027198890739722e-05} - -[Rank 2] Trainer log: {'loss': 0.3452, 'grad_norm': 13.088363647460938, 'learning_rate': 1.027198890739722e-05} - -{'loss': 0.3452, 'grad_norm': 13.088363647460938, 'learning_rate': 1.027198890739722e-05, 'epoch': 0.52} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.7087878227233887, 'train/info_loss': 0.07277967780828476, 'train/ref_loss': None, 'train/uncertainty_loss': -9.588631219230593e-05, 'train/video_loss': 0.07268378883600235, 'train/total_loss': 0.7814716696739197} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.2226, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1529, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27923567295074464, 'train/info_loss': 0.41270890831947327, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014234766131266953, 'train/video_loss': 0.41256657242774963, 'train/total_loss': 0.6918022632598877} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.1578, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4697, 'grad_norm': 4.678002834320068, 'learning_rate': 1.0261325153999097e-05}[Rank 1] Trainer log: {'loss': 0.4697, 'grad_norm': 4.678002834320068, 'learning_rate': 1.0261325153999097e-05} -[Rank 0] Trainer log: {'loss': 0.4697, 'grad_norm': 4.678002834320068, 'learning_rate': 1.0261325153999097e-05} - -[Rank 2] Trainer log: {'loss': 0.4697, 'grad_norm': 4.678002834320068, 'learning_rate': 1.0261325153999097e-05} -{'loss': 0.4697, 'grad_norm': 4.678002834320068, 'learning_rate': 1.0261325153999097e-05, 'epoch': 0.52} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(0.0280, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(0.4564, device='cuda:3', grad_fn=) {'train/tv_loss': 0.00027801706455647944, 'train/lm_loss': 6.050148513168097e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.09206611663103104, 'train/uncertainty_loss': -6.976993172429502e-05, 'train/video_loss': 0.09424963593482971, 'train/total_loss': 0.09431013464927673} -tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.3545, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1035, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019801987800747157, 'train/lm_loss': 0.0001766616478562355, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.2796594500541687, 'train/uncertainty_loss': 0.010348044335842133, 'train/video_loss': 0.29162630438804626, 'train/total_loss': 0.2918029725551605} -[Rank 3] Trainer log: {'loss': 0.4112, 'grad_norm': 15.068748474121094, 'learning_rate': 1.0250661103221957e-05} -[Rank 1] Trainer log: {'loss': 0.4112, 'grad_norm': 15.068748474121094, 'learning_rate': 1.0250661103221957e-05} -[Rank 0] Trainer log: {'loss': 0.4112, 'grad_norm': 15.068748474121094, 'learning_rate': 1.0250661103221957e-05}[Rank 2] Trainer log: {'loss': 0.4112, 'grad_norm': 15.068748474121094, 'learning_rate': 1.0250661103221957e-05} - -{'loss': 0.4112, 'grad_norm': 15.068748474121094, 'learning_rate': 1.0250661103221957e-05, 'epoch': 0.52} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2141, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028764770831912756, 'train/lm_loss': 4.736816044896841e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.11436429619789124, 'train/uncertainty_loss': -6.749824387952685e-05, 'train/video_loss': 0.11662492156028748, 'train/total_loss': 0.11667229235172272} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05066974759101868, 'train/info_loss': 0.30359041690826416, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010236204834654928, 'train/video_loss': 0.3034880459308624, 'train/total_loss': 0.35415780544281006} -tensor(0.1379, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0647, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2942, 'grad_norm': 8.669514656066895, 'learning_rate': 1.0239996767201123e-05}[Rank 3] Trainer log: {'loss': 0.2942, 'grad_norm': 8.669514656066895, 'learning_rate': 1.0239996767201123e-05} -[Rank 0] Trainer log: {'loss': 0.2942, 'grad_norm': 8.669514656066895, 'learning_rate': 1.0239996767201123e-05} - -[Rank 2] Trainer log: {'loss': 0.2942, 'grad_norm': 8.669514656066895, 'learning_rate': 1.0239996767201123e-05} -{'loss': 0.2942, 'grad_norm': 8.669514656066895, 'learning_rate': 1.0239996767201123e-05, 'epoch': 0.52} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24585421085357667, 'train/info_loss': 0.35907965898513794, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001336266053840518, 'train/video_loss': 0.3589460253715515, 'train/total_loss': 0.6048002243041992} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0184, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003515485674142838, 'train/lm_loss': 4.7225144226104024e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.19688668847084045, 'train/uncertainty_loss': -6.88537664245814e-05, 'train/video_loss': 0.19965673983097076, 'train/total_loss': 0.19970396161079407} -[Rank 3] Trainer log: {'loss': 0.3388, 'grad_norm': 1.9941456317901611, 'learning_rate': 1.0229332158072243e-05} -[Rank 2] Trainer log: {'loss': 0.3388, 'grad_norm': 1.9941456317901611, 'learning_rate': 1.0229332158072243e-05} -[Rank 0] Trainer log: {'loss': 0.3388, 'grad_norm': 1.9941456317901611, 'learning_rate': 1.0229332158072243e-05}[Rank 1] Trainer log: {'loss': 0.3388, 'grad_norm': 1.9941456317901611, 'learning_rate': 1.0229332158072243e-05} - -{'loss': 0.3388, 'grad_norm': 1.9941456317901611, 'learning_rate': 1.0229332158072243e-05, 'epoch': 0.52} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20341460704803468, 'train/info_loss': 0.16107824444770813, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011265723733231426, 'train/video_loss': 0.16096559166908264, 'train/total_loss': 0.3643801808357239} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0254, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002437019255012274, 'train/lm_loss': 5.3470139391720295e-05, 'train/info_loss': 2.7596188374445774e-05, 'train/ref_loss': 0.19962748885154724, 'train/uncertainty_loss': -6.799048860557377e-05, 'train/video_loss': 0.20153670012950897, 'train/total_loss': 0.2015901654958725} -[Rank 3] Trainer log: {'loss': 0.446, 'grad_norm': 3.640011787414551, 'learning_rate': 1.0218667287971269e-05} -[Rank 1] Trainer log: {'loss': 0.446, 'grad_norm': 3.640011787414551, 'learning_rate': 1.0218667287971269e-05}[Rank 2] Trainer log: {'loss': 0.446, 'grad_norm': 3.640011787414551, 'learning_rate': 1.0218667287971269e-05} - -[Rank 0] Trainer log: {'loss': 0.446, 'grad_norm': 3.640011787414551, 'learning_rate': 1.0218667287971269e-05} -{'loss': 0.446, 'grad_norm': 3.640011787414551, 'learning_rate': 1.0218667287971269e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1047, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002386766253039241, 'train/lm_loss': 4.736816044896841e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.27912068367004395, 'train/uncertainty_loss': 0.010465862601995468, 'train/video_loss': 0.29152169823646545, 'train/total_loss': 0.2915690541267395} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3826516389846802, 'train/info_loss': 0.1778016984462738, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000106504384893924, 'train/video_loss': 0.17769519984722137, 'train/total_loss': 0.5603468418121338} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1263, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2183, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.428, 'grad_norm': 3.670166254043579, 'learning_rate': 1.020800216903446e-05}[Rank 3] Trainer log: {'loss': 0.428, 'grad_norm': 3.670166254043579, 'learning_rate': 1.020800216903446e-05}[Rank 1] Trainer log: {'loss': 0.428, 'grad_norm': 3.670166254043579, 'learning_rate': 1.020800216903446e-05} - - -[Rank 2] Trainer log: {'loss': 0.428, 'grad_norm': 3.670166254043579, 'learning_rate': 1.020800216903446e-05} -{'loss': 0.428, 'grad_norm': 3.670166254043579, 'learning_rate': 1.020800216903446e-05, 'epoch': 0.52} -tensor(-0.0017, device='cuda:0', grad_fn=) tensor(-0.0017, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27690503597259525, 'train/info_loss': 0.11404796689748764, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001710515469312668, 'train/video_loss': 0.11387691646814346, 'train/total_loss': 0.3907819390296936} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1730, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.001003850530833006, 'train/lm_loss': 4.124216211494059e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.32121092081069946, 'train/uncertainty_loss': 0.017299915850162505, 'train/video_loss': 0.34656697511672974, 'train/total_loss': 0.3466082215309143} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3697, 'grad_norm': 6.48858642578125, 'learning_rate': 1.0197336813398358e-05}[Rank 0] Trainer log: {'loss': 0.3697, 'grad_norm': 6.48858642578125, 'learning_rate': 1.0197336813398358e-05}[Rank 3] Trainer log: {'loss': 0.3697, 'grad_norm': 6.48858642578125, 'learning_rate': 1.0197336813398358e-05} - - -[Rank 2] Trainer log: {'loss': 0.3697, 'grad_norm': 6.48858642578125, 'learning_rate': 1.0197336813398358e-05} -{'loss': 0.3697, 'grad_norm': 6.48858642578125, 'learning_rate': 1.0197336813398358e-05, 'epoch': 0.52} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029938877560198307, 'train/lm_loss': 6.062065367586911e-05, 'train/info_loss': 3.0516646802425385e-05, 'train/ref_loss': 0.13334062695503235, 'train/uncertainty_loss': -6.967686349526049e-05, 'train/video_loss': 0.13569657504558563, 'train/total_loss': 0.1357571929693222} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35734283924102783, 'train/info_loss': 0.15360015630722046, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011236483696848155, 'train/video_loss': 0.15348778665065765, 'train/total_loss': 0.5108306407928467} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3484, 'grad_norm': 1.7516065835952759, 'learning_rate': 1.0186671233199759e-05}[Rank 0] Trainer log: {'loss': 0.3484, 'grad_norm': 1.7516065835952759, 'learning_rate': 1.0186671233199759e-05} -[Rank 3] Trainer log: {'loss': 0.3484, 'grad_norm': 1.7516065835952759, 'learning_rate': 1.0186671233199759e-05} - -[Rank 2] Trainer log: {'loss': 0.3484, 'grad_norm': 1.7516065835952759, 'learning_rate': 1.0186671233199759e-05} -{'loss': 0.3484, 'grad_norm': 1.7516065835952759, 'learning_rate': 1.0186671233199759e-05, 'epoch': 0.52} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033711986616253854, 'train/lm_loss': 6.0334638692438607e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.20547814667224884, 'train/uncertainty_loss': -7.21000018529594e-05, 'train/video_loss': 0.20813149213790894, 'train/total_loss': 0.20819182693958282} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1771804690361023, 'train/info_loss': 0.13898736238479614, 'train/ref_loss': None, 'train/uncertainty_loss': -8.774801972322166e-05, 'train/video_loss': 0.13889960944652557, 'train/total_loss': 0.31608009338378906} -tensor(0.2219, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1327, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2898, 'grad_norm': 7.097209453582764, 'learning_rate': 1.0176005440575735e-05} -[Rank 3] Trainer log: {'loss': 0.2898, 'grad_norm': 7.097209453582764, 'learning_rate': 1.0176005440575735e-05}[Rank 2] Trainer log: {'loss': 0.2898, 'grad_norm': 7.097209453582764, 'learning_rate': 1.0176005440575735e-05} - -[Rank 0] Trainer log: {'loss': 0.2898, 'grad_norm': 7.097209453582764, 'learning_rate': 1.0176005440575735e-05} -{'loss': 0.2898, 'grad_norm': 7.097209453582764, 'learning_rate': 1.0176005440575735e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3320634841918946, 'train/info_loss': 0.18718135356903076, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010538914939388633, 'train/video_loss': 0.18707595765590668, 'train/total_loss': 0.5191394686698914} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0166, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0018, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2880, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003343664109706879, 'train/lm_loss': 6.071599782444537e-05, 'train/info_loss': 2.7596188374445774e-05, 'train/ref_loss': 0.41385841369628906, 'train/uncertainty_loss': 0.02880336940288544, 'train/video_loss': 0.4453643262386322, 'train/total_loss': 0.44542503356933594} -[Rank 0] Trainer log: {'loss': 0.3924, 'grad_norm': 8.774441719055176, 'learning_rate': 1.0165339447663586e-05}[Rank 3] Trainer log: {'loss': 0.3924, 'grad_norm': 8.774441719055176, 'learning_rate': 1.0165339447663586e-05} - -[Rank 1] Trainer log: {'loss': 0.3924, 'grad_norm': 8.774441719055176, 'learning_rate': 1.0165339447663586e-05} -[Rank 2] Trainer log: {'loss': 0.3924, 'grad_norm': 8.774441719055176, 'learning_rate': 1.0165339447663586e-05} -{'loss': 0.3924, 'grad_norm': 8.774441719055176, 'learning_rate': 1.0165339447663586e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.4556, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002759595634415746, 'train/lm_loss': 6.967761437408626e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.16786311566829681, 'train/uncertainty_loss': -7.120480877347291e-05, 'train/video_loss': 0.1700306534767151, 'train/total_loss': 0.17010033130645752} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42522783279418946, 'train/info_loss': 0.1884932965040207, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012330542085692286, 'train/video_loss': 0.1883699893951416, 'train/total_loss': 0.6135978698730469} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3550, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4309, 'grad_norm': 6.842411994934082, 'learning_rate': 1.0154673266600846e-05} -[Rank 0] Trainer log: {'loss': 0.4309, 'grad_norm': 6.842411994934082, 'learning_rate': 1.0154673266600846e-05}[Rank 1] Trainer log: {'loss': 0.4309, 'grad_norm': 6.842411994934082, 'learning_rate': 1.0154673266600846e-05} -[Rank 3] Trainer log: {'loss': 0.4309, 'grad_norm': 6.842411994934082, 'learning_rate': 1.0154673266600846e-05} - -{'loss': 0.4309, 'grad_norm': 6.842411994934082, 'learning_rate': 1.0154673266600846e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2545, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.5595, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15577839612960817, 'train/info_loss': 0.10978317260742188, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012214434100314976, 'train/video_loss': 0.1096610277891159, 'train/total_loss': 0.2654394209384918} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1364, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024399119429290296, 'train/lm_loss': 6.839059642516077e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.302418053150177, 'train/uncertainty_loss': 0.013644273579120637, 'train/video_loss': 0.3180450201034546, 'train/total_loss': 0.31811341643333435} -[Rank 2] Trainer log: {'loss': 0.4842, 'grad_norm': 15.20483684539795, 'learning_rate': 1.0144006909525259e-05} -[Rank 0] Trainer log: {'loss': 0.4842, 'grad_norm': 15.20483684539795, 'learning_rate': 1.0144006909525259e-05}[Rank 1] Trainer log: {'loss': 0.4842, 'grad_norm': 15.20483684539795, 'learning_rate': 1.0144006909525259e-05} -[Rank 3] Trainer log: {'loss': 0.4842, 'grad_norm': 15.20483684539795, 'learning_rate': 1.0144006909525259e-05} - -{'loss': 0.4842, 'grad_norm': 15.20483684539795, 'learning_rate': 1.0144006909525259e-05, 'epoch': 0.52} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.235964035987854, 'train/info_loss': 0.23311978578567505, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014140260173007847, 'train/video_loss': 0.23297838866710663, 'train/total_loss': 0.46894243359565735} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0765, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25767216682434085, 'train/info_loss': 0.2127174735069275, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012614508159458637, 'train/video_loss': 0.21259133517742157, 'train/total_loss': 0.4702634811401367} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2405, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4039, 'grad_norm': 3.7944929599761963, 'learning_rate': 1.0133340388574775e-05}[Rank 1] Trainer log: {'loss': 0.4039, 'grad_norm': 3.7944929599761963, 'learning_rate': 1.0133340388574775e-05}[Rank 0] Trainer log: {'loss': 0.4039, 'grad_norm': 3.7944929599761963, 'learning_rate': 1.0133340388574775e-05} - - -[Rank 2] Trainer log: {'loss': 0.4039, 'grad_norm': 3.7944929599761963, 'learning_rate': 1.0133340388574775e-05} -{'loss': 0.4039, 'grad_norm': 3.7944929599761963, 'learning_rate': 1.0133340388574775e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20532257556915284, 'train/info_loss': 0.21271319687366486, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001068850513547659, 'train/video_loss': 0.2126063108444214, 'train/total_loss': 0.41792887449264526} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5683286190032959, 'train/info_loss': 0.12123065441846848, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001014061039313674, 'train/video_loss': 0.12112925201654434, 'train/total_loss': 0.689457893371582} -[Rank 0] Trainer log: {'loss': 0.4039, 'grad_norm': 5.755252361297607, 'learning_rate': 1.0122673715887525e-05}[Rank 2] Trainer log: {'loss': 0.4039, 'grad_norm': 5.755252361297607, 'learning_rate': 1.0122673715887525e-05} - -[Rank 3] Trainer log: {'loss': 0.4039, 'grad_norm': 5.755252361297607, 'learning_rate': 1.0122673715887525e-05} -[Rank 1] Trainer log: {'loss': 0.4039, 'grad_norm': 5.755252361297607, 'learning_rate': 1.0122673715887525e-05} -{'loss': 0.4039, 'grad_norm': 5.755252361297607, 'learning_rate': 1.0122673715887525e-05, 'epoch': 0.52} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.5147, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0754, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035777371376752857, 'train/lm_loss': 3.604564117267728e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.2573245167732239, 'train/uncertainty_loss': 0.007543467730283738, 'train/video_loss': 0.2677525281906128, 'train/total_loss': 0.2677885591983795} -tensor(0.5674, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0017, device='cuda:1', grad_fn=) tensor(-0.0017, device='cuda:1', grad_fn=) -tensor(0.2547, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006599554792046547, 'train/lm_loss': 0.00015005203895270827, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.3848358988761902, 'train/uncertainty_loss': 0.025469797849655154, 'train/video_loss': 0.415619432926178, 'train/total_loss': 0.4157694876194} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4969, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.407, 'grad_norm': 17.343589782714844, 'learning_rate': 1.0112006903601814e-05} -[Rank 0] Trainer log: {'loss': 0.407, 'grad_norm': 17.343589782714844, 'learning_rate': 1.0112006903601814e-05}[Rank 2] Trainer log: {'loss': 0.407, 'grad_norm': 17.343589782714844, 'learning_rate': 1.0112006903601814e-05} - -[Rank 3] Trainer log: {'loss': 0.407, 'grad_norm': 17.343589782714844, 'learning_rate': 1.0112006903601814e-05} -{'loss': 0.407, 'grad_norm': 17.343589782714844, 'learning_rate': 1.0112006903601814e-05, 'epoch': 0.52} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14073091745376587, 'train/info_loss': 0.12141887843608856, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010524510871618987, 'train/video_loss': 0.12131363153457642, 'train/total_loss': 0.2620445489883423} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(1.1958, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4502, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08772691488265992, 'train/info_loss': 0.22969700396060944, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001000275486148894, 'train/video_loss': 0.22959697246551514, 'train/total_loss': 0.3173238933086395} -tensor(0.1521, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3828, 'grad_norm': 13.73572826385498, 'learning_rate': 1.0101339963856112e-05}[Rank 0] Trainer log: {'loss': 0.3828, 'grad_norm': 13.73572826385498, 'learning_rate': 1.0101339963856112e-05}[Rank 3] Trainer log: {'loss': 0.3828, 'grad_norm': 13.73572826385498, 'learning_rate': 1.0101339963856112e-05} - - -{'loss': 0.3828, 'grad_norm': 13.73572826385498, 'learning_rate': 1.0101339963856112e-05, 'epoch': 0.52} -[Rank 2] Trainer log: {'loss': 0.3828, 'grad_norm': 13.73572826385498, 'learning_rate': 1.0101339963856112e-05} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.47529053688049316, 'train/info_loss': 0.21951007843017578, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011488735908642412, 'train/video_loss': 0.21939519047737122, 'train/total_loss': 0.694685697555542} -tensor(0.3114, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.7269, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017132483189925552, 'train/lm_loss': 4.7225144226104024e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.13118532299995422, 'train/uncertainty_loss': -6.870559882372617e-05, 'train/video_loss': 0.13251522183418274, 'train/total_loss': 0.13256244361400604} -tensor(0.1161, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5007, 'grad_norm': 1.9356613159179688, 'learning_rate': 1.0090672908789022e-05} -[Rank 2] Trainer log: {'loss': 0.5007, 'grad_norm': 1.9356613159179688, 'learning_rate': 1.0090672908789022e-05} -[Rank 3] Trainer log: {'loss': 0.5007, 'grad_norm': 1.9356613159179688, 'learning_rate': 1.0090672908789022e-05} -[Rank 0] Trainer log: {'loss': 0.5007, 'grad_norm': 1.9356613159179688, 'learning_rate': 1.0090672908789022e-05} -{'loss': 0.5007, 'grad_norm': 1.9356613159179688, 'learning_rate': 1.0090672908789022e-05, 'epoch': 0.52} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1431, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1897, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032896122429519896, 'train/lm_loss': 6.076366407796741e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.33896467089653015, 'train/uncertainty_loss': 0.018973743915557863, 'train/video_loss': 0.36059924960136414, 'train/total_loss': 0.36066001653671265} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08827877044677734, 'train/info_loss': 0.15871931612491608, 'train/ref_loss': None, 'train/uncertainty_loss': -9.094329434446991e-05, 'train/video_loss': 0.15862837433815002, 'train/total_loss': 0.24690714478492737} -tensor(0.2504, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3487, 'grad_norm': 8.969770431518555, 'learning_rate': 1.0080005750539288e-05} -[Rank 2] Trainer log: {'loss': 0.3487, 'grad_norm': 8.969770431518555, 'learning_rate': 1.0080005750539288e-05} -[Rank 3] Trainer log: {'loss': 0.3487, 'grad_norm': 8.969770431518555, 'learning_rate': 1.0080005750539288e-05} -[Rank 0] Trainer log: {'loss': 0.3487, 'grad_norm': 8.969770431518555, 'learning_rate': 1.0080005750539288e-05} -{'loss': 0.3487, 'grad_norm': 8.969770431518555, 'learning_rate': 1.0080005750539288e-05, 'epoch': 0.52} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031709831673651937, 'train/lm_loss': 4.10037930123508e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.07818683981895447, 'train/uncertainty_loss': -6.890305085107685e-05, 'train/video_loss': 0.08067964017391205, 'train/total_loss': 0.08072064071893692} -tensor(0.0287, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1528, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23820385932922364, 'train/info_loss': 0.16165778040885925, 'train/ref_loss': None, 'train/uncertainty_loss': -9.866863256320358e-05, 'train/video_loss': 0.1615591049194336, 'train/total_loss': 0.39976298809051514} -[Rank 0] Trainer log: {'loss': 0.3432, 'grad_norm': 3.7951619625091553, 'learning_rate': 1.0069338501245764e-05}[Rank 1] Trainer log: {'loss': 0.3432, 'grad_norm': 3.7951619625091553, 'learning_rate': 1.0069338501245764e-05}[Rank 2] Trainer log: {'loss': 0.3432, 'grad_norm': 3.7951619625091553, 'learning_rate': 1.0069338501245764e-05} - - -[Rank 3] Trainer log: {'loss': 0.3432, 'grad_norm': 3.7951619625091553, 'learning_rate': 1.0069338501245764e-05} -{'loss': 0.3432, 'grad_norm': 3.7951619625091553, 'learning_rate': 1.0069338501245764e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018729695584625007, 'train/lm_loss': 4.128983418922872e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.12133095413446426, 'train/uncertainty_loss': -6.817649700678885e-05, 'train/video_loss': 0.12278494238853455, 'train/total_loss': 0.1228262335062027} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08181087970733643, 'train/info_loss': 0.11290903389453888, 'train/ref_loss': None, 'train/uncertainty_loss': -9.206192335113883e-05, 'train/video_loss': 0.11281697452068329, 'train/total_loss': 0.19462785124778748} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4298, 'grad_norm': 3.322925329208374, 'learning_rate': 1.0058671173047421e-05}[Rank 0] Trainer log: {'loss': 0.4298, 'grad_norm': 3.322925329208374, 'learning_rate': 1.0058671173047421e-05} -[Rank 2] Trainer log: {'loss': 0.4298, 'grad_norm': 3.322925329208374, 'learning_rate': 1.0058671173047421e-05} - -[Rank 3] Trainer log: {'loss': 0.4298, 'grad_norm': 3.322925329208374, 'learning_rate': 1.0058671173047421e-05} -{'loss': 0.4298, 'grad_norm': 3.322925329208374, 'learning_rate': 1.0058671173047421e-05, 'epoch': 0.52} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.050173079967498785, 'train/info_loss': 0.2670094072818756, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010168944718316198, 'train/video_loss': 0.2669077217578888, 'train/total_loss': 0.3170807957649231} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.4020, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0518, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006210618652403356, 'train/lm_loss': 8.00449401140213e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.17894026637077332, 'train/uncertainty_loss': 0.005176690220832825, 'train/video_loss': 0.18911650776863098, 'train/total_loss': 0.18919655680656433} -[Rank 0] Trainer log: {'loss': 0.292, 'grad_norm': 10.916274070739746, 'learning_rate': 1.0048003778083302e-05}[Rank 1] Trainer log: {'loss': 0.292, 'grad_norm': 10.916274070739746, 'learning_rate': 1.0048003778083302e-05} -[Rank 2] Trainer log: {'loss': 0.292, 'grad_norm': 10.916274070739746, 'learning_rate': 1.0048003778083302e-05} -[Rank 3] Trainer log: {'loss': 0.292, 'grad_norm': 10.916274070739746, 'learning_rate': 1.0048003778083302e-05} - -{'loss': 0.292, 'grad_norm': 10.916274070739746, 'learning_rate': 1.0048003778083302e-05, 'epoch': 0.52} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.5012, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23477642536163332, 'train/info_loss': 0.21321925520896912, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011224894551560284, 'train/video_loss': 0.21310700476169586, 'train/total_loss': 0.4478834271430969} -tensor(0.1963, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0429, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0915, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020489711314439774, 'train/lm_loss': 0.0001885716919787228, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.25641506910324097, 'train/uncertainty_loss': 0.00914984941482544, 'train/video_loss': 0.26724183559417725, 'train/total_loss': 0.2674303948879242} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3402, 'grad_norm': 3.3754327297210693, 'learning_rate': 1.0037336328492541e-05} -[Rank 0] Trainer log: {'loss': 0.3402, 'grad_norm': 3.3754327297210693, 'learning_rate': 1.0037336328492541e-05}[Rank 2] Trainer log: {'loss': 0.3402, 'grad_norm': 3.3754327297210693, 'learning_rate': 1.0037336328492541e-05} -[Rank 1] Trainer log: {'loss': 0.3402, 'grad_norm': 3.3754327297210693, 'learning_rate': 1.0037336328492541e-05} - -{'loss': 0.3402, 'grad_norm': 3.3754327297210693, 'learning_rate': 1.0037336328492541e-05, 'epoch': 0.52} -tensor(0.1631, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003767469432204962, 'train/lm_loss': 9.408168261870742e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.13449618220329285, 'train/uncertainty_loss': -6.884667673148216e-05, 'train/video_loss': 0.13747206330299377, 'train/total_loss': 0.13756614923477173} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0221, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.271199631690979, 'train/info_loss': 0.18855834007263184, 'train/ref_loss': None, 'train/uncertainty_loss': -9.469745564274491e-05, 'train/video_loss': 0.18846364319324493, 'train/total_loss': 0.4596632719039917} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1589, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1915, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2948, 'grad_norm': 4.811509132385254, 'learning_rate': 1.0026668836414325e-05}[Rank 1] Trainer log: {'loss': 0.2948, 'grad_norm': 4.811509132385254, 'learning_rate': 1.0026668836414325e-05} - -[Rank 2] Trainer log: {'loss': 0.2948, 'grad_norm': 4.811509132385254, 'learning_rate': 1.0026668836414325e-05}[Rank 3] Trainer log: {'loss': 0.2948, 'grad_norm': 4.811509132385254, 'learning_rate': 1.0026668836414325e-05} - -{'loss': 0.2948, 'grad_norm': 4.811509132385254, 'learning_rate': 1.0026668836414325e-05, 'epoch': 0.52} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.2192, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0006, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22506694793701174, 'train/info_loss': 0.26982539892196655, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012653276789933443, 'train/video_loss': 0.2696988582611084, 'train/total_loss': 0.4947658181190491} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0215, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027425438165664675, 'train/lm_loss': 5.349397542886436e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.19092264771461487, 'train/uncertainty_loss': 0.0021529845893383027, 'train/video_loss': 0.19529540836811066, 'train/total_loss': 0.19534890353679657} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0006, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3087, 'grad_norm': 13.433445930480957, 'learning_rate': 1.0016001313987893e-05}[Rank 0] Trainer log: {'loss': 0.3087, 'grad_norm': 13.433445930480957, 'learning_rate': 1.0016001313987893e-05} -[Rank 2] Trainer log: {'loss': 0.3087, 'grad_norm': 13.433445930480957, 'learning_rate': 1.0016001313987893e-05} - -[Rank 3] Trainer log: {'loss': 0.3087, 'grad_norm': 13.433445930480957, 'learning_rate': 1.0016001313987893e-05} -{'loss': 0.3087, 'grad_norm': 13.433445930480957, 'learning_rate': 1.0016001313987893e-05, 'epoch': 0.52} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.3414, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002538881963118911, 'train/lm_loss': 0.00020169573836028578, 'train/info_loss': 3.772831769310869e-05, 'train/ref_loss': 0.45652705430984497, 'train/uncertainty_loss': 0.0341391921043396, 'train/video_loss': 0.4927350878715515, 'train/total_loss': 0.49293678998947144} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.4036, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018340573878958822, 'train/lm_loss': 4.164738929830492e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.507586658000946, 'train/uncertainty_loss': 0.040363344550132754, 'train/video_loss': 0.5494418144226074, 'train/total_loss': 0.5494834780693054} -[Rank 3] Trainer log: {'loss': 0.4838, 'grad_norm': 4.623040676116943, 'learning_rate': 1.0005333773352515e-05}[Rank 1] Trainer log: {'loss': 0.4838, 'grad_norm': 4.623040676116943, 'learning_rate': 1.0005333773352515e-05} - -[Rank 2] Trainer log: {'loss': 0.4838, 'grad_norm': 4.623040676116943, 'learning_rate': 1.0005333773352515e-05}[Rank 0] Trainer log: {'loss': 0.4838, 'grad_norm': 4.623040676116943, 'learning_rate': 1.0005333773352515e-05} - -{'loss': 0.4838, 'grad_norm': 4.623040676116943, 'learning_rate': 1.0005333773352515e-05, 'epoch': 0.52} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2825288772583008, 'train/info_loss': 0.21376781165599823, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015511977253481748, 'train/video_loss': 0.21361269056797028, 'train/total_loss': 0.49614155292510986} -tensor(0.1631, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0078, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0066, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000258711795322597, 'train/lm_loss': 6.822376162745059e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.1460549533367157, 'train/uncertainty_loss': 0.0006645868532359601, 'train/video_loss': 0.14881499111652374, 'train/total_loss': 0.14888320863246918} -[Rank 0] Trainer log: {'loss': 0.3893, 'grad_norm': 7.223668575286865, 'learning_rate': 9.99466622664749e-06}[Rank 2] Trainer log: {'loss': 0.3893, 'grad_norm': 7.223668575286865, 'learning_rate': 9.99466622664749e-06}[Rank 1] Trainer log: {'loss': 0.3893, 'grad_norm': 7.223668575286865, 'learning_rate': 9.99466622664749e-06} - -[Rank 3] Trainer log: {'loss': 0.3893, 'grad_norm': 7.223668575286865, 'learning_rate': 9.99466622664749e-06} - -{'loss': 0.3893, 'grad_norm': 7.223668575286865, 'learning_rate': 9.99466622664749e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31644217967987065, 'train/info_loss': 0.17710520327091217, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011837707133963705, 'train/video_loss': 0.17698682844638824, 'train/total_loss': 0.4934290051460266} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1642373323440552, 'train/info_loss': 0.11295831203460693, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010268555488437415, 'train/video_loss': 0.11285562813282013, 'train/total_loss': 0.27709296345710754} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.4128, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3663, 'grad_norm': 6.046321868896484, 'learning_rate': 9.98399868601211e-06}[Rank 0] Trainer log: {'loss': 0.3663, 'grad_norm': 6.046321868896484, 'learning_rate': 9.98399868601211e-06} - -[Rank 3] Trainer log: {'loss': 0.3663, 'grad_norm': 6.046321868896484, 'learning_rate': 9.98399868601211e-06}[Rank 1] Trainer log: {'loss': 0.3663, 'grad_norm': 6.046321868896484, 'learning_rate': 9.98399868601211e-06} - -{'loss': 0.3663, 'grad_norm': 6.046321868896484, 'learning_rate': 9.98399868601211e-06, 'epoch': 0.53} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3770950317382813, 'train/info_loss': 0.19047217071056366, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010856628650799394, 'train/video_loss': 0.19036360085010529, 'train/total_loss': 0.5674586296081543} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1348, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1188, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027085249312222006, 'train/lm_loss': 4.7177472151815894e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.25221166014671326, 'train/uncertainty_loss': 0.011883569508790971, 'train/video_loss': 0.2662869393825531, 'train/total_loss': 0.2663341164588928} -tensor(0.0097, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3133, 'grad_norm': 5.697210311889648, 'learning_rate': 9.97333116358568e-06} -[Rank 0] Trainer log: {'loss': 0.3133, 'grad_norm': 5.697210311889648, 'learning_rate': 9.97333116358568e-06}[Rank 2] Trainer log: {'loss': 0.3133, 'grad_norm': 5.697210311889648, 'learning_rate': 9.97333116358568e-06}[Rank 3] Trainer log: {'loss': 0.3133, 'grad_norm': 5.697210311889648, 'learning_rate': 9.97333116358568e-06} - - -{'loss': 0.3133, 'grad_norm': 5.697210311889648, 'learning_rate': 9.97333116358568e-06, 'epoch': 0.53} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16684737205505373, 'train/info_loss': 0.19795072078704834, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010188871528953314, 'train/video_loss': 0.1978488266468048, 'train/total_loss': 0.364696204662323} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5952568054199219, 'train/info_loss': 0.16124223172664642, 'train/ref_loss': None, 'train/uncertainty_loss': -9.248280548490584e-05, 'train/video_loss': 0.1611497551202774, 'train/total_loss': 0.7564065456390381} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4261, 'grad_norm': 4.716311931610107, 'learning_rate': 9.96266367150746e-06}[Rank 1] Trainer log: {'loss': 0.4261, 'grad_norm': 4.716311931610107, 'learning_rate': 9.96266367150746e-06}[Rank 2] Trainer log: {'loss': 0.4261, 'grad_norm': 4.716311931610107, 'learning_rate': 9.96266367150746e-06} - - -[Rank 0] Trainer log: {'loss': 0.4261, 'grad_norm': 4.716311931610107, 'learning_rate': 9.96266367150746e-06} -{'loss': 0.4261, 'grad_norm': 4.716311931610107, 'learning_rate': 9.96266367150746e-06, 'epoch': 0.53} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0895, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002478619571775198, 'train/lm_loss': 3.185018722433597e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.26268285512924194, 'train/uncertainty_loss': 0.00895066112279892, 'train/video_loss': 0.2736373841762543, 'train/total_loss': 0.2736692428588867} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00013374363770708443, 'train/lm_loss': 5.363698583096266e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.13141101598739624, 'train/uncertainty_loss': -6.687840213999152e-05, 'train/video_loss': 0.13243822753429413, 'train/total_loss': 0.13249187171459198} -[Rank 2] Trainer log: {'loss': 0.3826, 'grad_norm': 4.268515586853027, 'learning_rate': 9.9519962219167e-06}[Rank 3] Trainer log: {'loss': 0.3826, 'grad_norm': 4.268515586853027, 'learning_rate': 9.9519962219167e-06}[Rank 0] Trainer log: {'loss': 0.3826, 'grad_norm': 4.268515586853027, 'learning_rate': 9.9519962219167e-06} - -[Rank 1] Trainer log: {'loss': 0.3826, 'grad_norm': 4.268515586853027, 'learning_rate': 9.9519962219167e-06} - -{'loss': 0.3826, 'grad_norm': 4.268515586853027, 'learning_rate': 9.9519962219167e-06, 'epoch': 0.53} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10129740238189698, 'train/info_loss': 0.21946188807487488, 'train/ref_loss': None, 'train/uncertainty_loss': -9.306857245974244e-05, 'train/video_loss': 0.2193688154220581, 'train/total_loss': 0.32066622376441956} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0963, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1304, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016445619985461235, 'train/lm_loss': 7.792385295033455e-05, 'train/info_loss': 2.9861035727662966e-05, 'train/ref_loss': 0.2649697959423065, 'train/uncertainty_loss': 0.013040946424007417, 'train/video_loss': 0.2793562710285187, 'train/total_loss': 0.2794342041015625} -[Rank 1] Trainer log: {'loss': 0.3447, 'grad_norm': 7.2637128829956055, 'learning_rate': 9.941328826952584e-06} -[Rank 0] Trainer log: {'loss': 0.3447, 'grad_norm': 7.2637128829956055, 'learning_rate': 9.941328826952584e-06} -[Rank 3] Trainer log: {'loss': 0.3447, 'grad_norm': 7.2637128829956055, 'learning_rate': 9.941328826952584e-06} -[Rank 2] Trainer log: {'loss': 0.3447, 'grad_norm': 7.2637128829956055, 'learning_rate': 9.941328826952584e-06} -{'loss': 0.3447, 'grad_norm': 7.2637128829956055, 'learning_rate': 9.941328826952584e-06, 'epoch': 0.53} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4019469261169434, 'train/info_loss': 0.16441477835178375, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010219553951174021, 'train/video_loss': 0.16431258618831635, 'train/total_loss': 0.566259503364563} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1543, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.52326979637146, 'train/info_loss': 0.14738011360168457, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011244198540225626, 'train/video_loss': 0.1472676694393158, 'train/total_loss': 0.6705374717712402} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0269, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1366, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4341, 'grad_norm': 5.05082368850708, 'learning_rate': 9.930661498754237e-06}[Rank 0] Trainer log: {'loss': 0.4341, 'grad_norm': 5.05082368850708, 'learning_rate': 9.930661498754237e-06} -[Rank 3] Trainer log: {'loss': 0.4341, 'grad_norm': 5.05082368850708, 'learning_rate': 9.930661498754237e-06}[Rank 1] Trainer log: {'loss': 0.4341, 'grad_norm': 5.05082368850708, 'learning_rate': 9.930661498754237e-06} - - -{'loss': 0.4341, 'grad_norm': 5.05082368850708, 'learning_rate': 9.930661498754237e-06, 'epoch': 0.53} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2891, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4777, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1556, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00012294191401451826, 'train/lm_loss': 3.1659481464885175e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.3185858130455017, 'train/uncertainty_loss': 0.015562005341053009, 'train/video_loss': 0.3351513743400574, 'train/total_loss': 0.3351830244064331} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020797725301235914, 'train/lm_loss': 4.095611802767962e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.0552663654088974, 'train/uncertainty_loss': -7.04715319443494e-05, 'train/video_loss': 0.056882064789533615, 'train/total_loss': 0.056923020631074905} -[Rank 3] Trainer log: {'loss': 0.3602, 'grad_norm': 13.073532104492188, 'learning_rate': 9.919994249460719e-06} -[Rank 1] Trainer log: {'loss': 0.3602, 'grad_norm': 13.073532104492188, 'learning_rate': 9.919994249460719e-06}[Rank 2] Trainer log: {'loss': 0.3602, 'grad_norm': 13.073532104492188, 'learning_rate': 9.919994249460719e-06} - -[Rank 0] Trainer log: {'loss': 0.3602, 'grad_norm': 13.073532104492188, 'learning_rate': 9.919994249460719e-06} -{'loss': 0.3602, 'grad_norm': 13.073532104492188, 'learning_rate': 9.919994249460719e-06, 'epoch': 0.53} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.169821834564209, 'train/info_loss': 0.21931423246860504, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010809461819007994, 'train/video_loss': 0.21920613944530487, 'train/total_loss': 0.38902798295021057} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(0.2659, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13058121204376222, 'train/info_loss': 0.17436739802360535, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012870189966633918, 'train/video_loss': 0.17423869669437408, 'train/total_loss': 0.30481991171836853} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0576, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.8981, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4346, 'grad_norm': 15.364766120910645, 'learning_rate': 9.909327091210983e-06} -[Rank 2] Trainer log: {'loss': 0.4346, 'grad_norm': 15.364766120910645, 'learning_rate': 9.909327091210983e-06} -[Rank 0] Trainer log: {'loss': 0.4346, 'grad_norm': 15.364766120910645, 'learning_rate': 9.909327091210983e-06}[Rank 1] Trainer log: {'loss': 0.4346, 'grad_norm': 15.364766120910645, 'learning_rate': 9.909327091210983e-06} - -{'loss': 0.4346, 'grad_norm': 15.364766120910645, 'learning_rate': 9.909327091210983e-06, 'epoch': 0.53} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16613200902938843, 'train/info_loss': 0.19078221917152405, 'train/ref_loss': None, 'train/uncertainty_loss': -9.068567305803299e-05, 'train/video_loss': 0.1906915307044983, 'train/total_loss': 0.35682356357574463} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0031, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.6496, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028957964386790993, 'train/lm_loss': 0.0001080221263691783, 'train/info_loss': 3.462909808149561e-05, 'train/ref_loss': 0.7035796046257019, 'train/uncertainty_loss': 0.06495667099952698, 'train/video_loss': 0.770887553691864, 'train/total_loss': 0.770995557308197} -tensor(0.1334, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3869, 'grad_norm': 10.740744590759277, 'learning_rate': 9.898660036143893e-06}[Rank 0] Trainer log: {'loss': 0.3869, 'grad_norm': 10.740744590759277, 'learning_rate': 9.898660036143893e-06} -[Rank 3] Trainer log: {'loss': 0.3869, 'grad_norm': 10.740744590759277, 'learning_rate': 9.898660036143893e-06} -[Rank 2] Trainer log: {'loss': 0.3869, 'grad_norm': 10.740744590759277, 'learning_rate': 9.898660036143893e-06} - -{'loss': 0.3869, 'grad_norm': 10.740744590759277, 'learning_rate': 9.898660036143893e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002766900230199099, 'train/lm_loss': 6.02631364017725e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.15466630458831787, 'train/uncertainty_loss': -7.207310991361738e-05, 'train/video_loss': 0.15683385729789734, 'train/total_loss': 0.15689411759376526} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2303, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022668892052024603, 'train/lm_loss': 4.105146508663893e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.07787476480007172, 'train/uncertainty_loss': -6.734549533575774e-05, 'train/video_loss': 0.07964363694190979, 'train/total_loss': 0.07968468964099884} -[Rank 2] Trainer log: {'loss': 0.2783, 'grad_norm': 9.843986511230469, 'learning_rate': 9.887993096398189e-06} -[Rank 1] Trainer log: {'loss': 0.2783, 'grad_norm': 9.843986511230469, 'learning_rate': 9.887993096398189e-06} -[Rank 3] Trainer log: {'loss': 0.2783, 'grad_norm': 9.843986511230469, 'learning_rate': 9.887993096398189e-06} -[Rank 0] Trainer log: {'loss': 0.2783, 'grad_norm': 9.843986511230469, 'learning_rate': 9.887993096398189e-06} -{'loss': 0.2783, 'grad_norm': 9.843986511230469, 'learning_rate': 9.887993096398189e-06, 'epoch': 0.53} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3843416690826416, 'train/info_loss': 0.24556589126586914, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011938431998714805, 'train/video_loss': 0.24544650316238403, 'train/total_loss': 0.6297881603240967} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0658, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.4413, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019019241444766523, 'train/lm_loss': 5.368465790525079e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.4962139427661896, 'train/uncertainty_loss': 0.044126811623573306, 'train/video_loss': 0.5418896675109863, 'train/total_loss': 0.541943371295929} -[Rank 1] Trainer log: {'loss': 0.4519, 'grad_norm': 4.2213053703308105, 'learning_rate': 9.877326284112474e-06} -[Rank 2] Trainer log: {'loss': 0.4519, 'grad_norm': 4.2213053703308105, 'learning_rate': 9.877326284112474e-06} -[Rank 0] Trainer log: {'loss': 0.4519, 'grad_norm': 4.2213053703308105, 'learning_rate': 9.877326284112474e-06}[Rank 3] Trainer log: {'loss': 0.4519, 'grad_norm': 4.2213053703308105, 'learning_rate': 9.877326284112474e-06} - -{'loss': 0.4519, 'grad_norm': 4.2213053703308105, 'learning_rate': 9.877326284112474e-06, 'epoch': 0.53} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=)tensor(-0.0009, device='cuda:2', grad_fn=) - {'train/tv_loss': None, 'train/lm_loss': 0.125505530834198, 'train/info_loss': 0.11409539729356766, 'train/ref_loss': None, 'train/uncertainty_loss': -8.469679742120207e-05, 'train/video_loss': 0.11401069909334183, 'train/total_loss': 0.2395162284374237} -tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020534088835120202, 'train/lm_loss': 3.676076594274491e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.15260684490203857, 'train/uncertainty_loss': -6.945908535271883e-05, 'train/video_loss': 0.15420283377170563, 'train/total_loss': 0.15423959493637085} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3192, 'grad_norm': 4.41371488571167, 'learning_rate': 9.866659611425227e-06}[Rank 3] Trainer log: {'loss': 0.3192, 'grad_norm': 4.41371488571167, 'learning_rate': 9.866659611425227e-06}[Rank 0] Trainer log: {'loss': 0.3192, 'grad_norm': 4.41371488571167, 'learning_rate': 9.866659611425227e-06} - -[Rank 2] Trainer log: {'loss': 0.3192, 'grad_norm': 4.41371488571167, 'learning_rate': 9.866659611425227e-06} - -{'loss': 0.3192, 'grad_norm': 4.41371488571167, 'learning_rate': 9.866659611425227e-06, 'epoch': 0.53} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016512495931237937, 'train/lm_loss': 6.085900240577758e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.1595165729522705, 'train/uncertainty_loss': -7.22062075510621e-05, 'train/video_loss': 0.16079147160053253, 'train/total_loss': 0.1608523279428482} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0955, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0097, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0533, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004295493941754103, 'train/lm_loss': 6.0644489713013174e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.2457863986492157, 'train/uncertainty_loss': 0.005326532945036888, 'train/video_loss': 0.25457584857940674, 'train/total_loss': 0.2546364963054657} -[Rank 1] Trainer log: {'loss': 0.2868, 'grad_norm': 2.2143747806549072, 'learning_rate': 9.855993090474741e-06}[Rank 0] Trainer log: {'loss': 0.2868, 'grad_norm': 2.2143747806549072, 'learning_rate': 9.855993090474741e-06} -[Rank 3] Trainer log: {'loss': 0.2868, 'grad_norm': 2.2143747806549072, 'learning_rate': 9.855993090474741e-06} -[Rank 2] Trainer log: {'loss': 0.2868, 'grad_norm': 2.2143747806549072, 'learning_rate': 9.855993090474741e-06} - -{'loss': 0.2868, 'grad_norm': 2.2143747806549072, 'learning_rate': 9.855993090474741e-06, 'epoch': 0.53} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.4906, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2125, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0006, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002541269408538938, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.22081905603408813, 'train/uncertainty_loss': 6.473585963249207e-05, 'train/video_loss': 0.2229425460100174, 'train/total_loss': 0.22298969328403473} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1158, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0006, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020664033945649862, 'train/lm_loss': 4.078925994690508e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.18206000328063965, 'train/uncertainty_loss': -6.44629297312349e-05, 'train/video_loss': 0.18367137014865875, 'train/total_loss': 0.1837121546268463} -[Rank 2] Trainer log: {'loss': 0.3523, 'grad_norm': 12.248583793640137, 'learning_rate': 9.845326733399157e-06}[Rank 1] Trainer log: {'loss': 0.3523, 'grad_norm': 12.248583793640137, 'learning_rate': 9.845326733399157e-06}[Rank 3] Trainer log: {'loss': 0.3523, 'grad_norm': 12.248583793640137, 'learning_rate': 9.845326733399157e-06} - - -[Rank 0] Trainer log: {'loss': 0.3523, 'grad_norm': 12.248583793640137, 'learning_rate': 9.845326733399157e-06} -{'loss': 0.3523, 'grad_norm': 12.248583793640137, 'learning_rate': 9.845326733399157e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18165471553802492, 'train/info_loss': 0.20819121599197388, 'train/ref_loss': None, 'train/uncertainty_loss': -9.400860872119666e-05, 'train/video_loss': 0.2080972045660019, 'train/total_loss': 0.3897519111633301} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2657, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2413576126098633, 'train/info_loss': 0.2448195517063141, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012905091280117633, 'train/video_loss': 0.24469050765037537, 'train/total_loss': 0.4860481023788452} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1724, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0472, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 12.638797760009766, 'learning_rate': 9.834660552336415e-06}[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 12.638797760009766, 'learning_rate': 9.834660552336415e-06}[Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 12.638797760009766, 'learning_rate': 9.834660552336415e-06} - -[Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 12.638797760009766, 'learning_rate': 9.834660552336415e-06} - -{'loss': 0.3904, 'grad_norm': 12.638797760009766, 'learning_rate': 9.834660552336415e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.2095, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2437, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002777520567178726, 'train/lm_loss': 4.109914007131011e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.3772381544113159, 'train/uncertainty_loss': 0.0243689700961113, 'train/video_loss': 0.4038529098033905, 'train/total_loss': 0.40389400720596313} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26831533908843996, 'train/info_loss': 0.15871329605579376, 'train/ref_loss': None, 'train/uncertainty_loss': -9.758115047588945e-05, 'train/video_loss': 0.15861570835113525, 'train/total_loss': 0.4269310534000397} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3291, 'grad_norm': 5.383129596710205, 'learning_rate': 9.823994559424268e-06}[Rank 3] Trainer log: {'loss': 0.3291, 'grad_norm': 5.383129596710205, 'learning_rate': 9.823994559424268e-06}[Rank 2] Trainer log: {'loss': 0.3291, 'grad_norm': 5.383129596710205, 'learning_rate': 9.823994559424268e-06} - - -[Rank 0] Trainer log: {'loss': 0.3291, 'grad_norm': 5.383129596710205, 'learning_rate': 9.823994559424268e-06} -{'loss': 0.3291, 'grad_norm': 5.383129596710205, 'learning_rate': 9.823994559424268e-06, 'epoch': 0.53} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2022587299346924, 'train/info_loss': 0.19138947129249573, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012515303678810598, 'train/video_loss': 0.1912643164396286, 'train/total_loss': 0.39352303743362427} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0467, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2286, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3261, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003235219744965434, 'train/lm_loss': 3.621250216383487e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.42734014987945557, 'train/uncertainty_loss': 0.03261305391788483, 'train/video_loss': 0.46256446838378906, 'train/total_loss': 0.4626006782054901} -[Rank 2] Trainer log: {'loss': 0.3884, 'grad_norm': 11.575973510742188, 'learning_rate': 9.813328766800243e-06}[Rank 3] Trainer log: {'loss': 0.3884, 'grad_norm': 11.575973510742188, 'learning_rate': 9.813328766800243e-06}[Rank 1] Trainer log: {'loss': 0.3884, 'grad_norm': 11.575973510742188, 'learning_rate': 9.813328766800243e-06} - - -[Rank 0] Trainer log: {'loss': 0.3884, 'grad_norm': 11.575973510742188, 'learning_rate': 9.813328766800243e-06} -{'loss': 0.3884, 'grad_norm': 11.575973510742188, 'learning_rate': 9.813328766800243e-06, 'epoch': 0.53} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.051041108369827275, 'train/info_loss': 0.16646994650363922, 'train/ref_loss': None, 'train/uncertainty_loss': -9.636799804866314e-05, 'train/video_loss': 0.1663735806941986, 'train/total_loss': 0.21741469204425812} -tensor(0.0441, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4454, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20763864517211916, 'train/info_loss': 0.1740013211965561, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013908690307289364, 'train/video_loss': 0.17386223375797272, 'train/total_loss': 0.38150089979171753} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2765, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4121, 'grad_norm': 5.2237958908081055, 'learning_rate': 9.802663186601647e-06}[Rank 2] Trainer log: {'loss': 0.4121, 'grad_norm': 5.2237958908081055, 'learning_rate': 9.802663186601647e-06}[Rank 3] Trainer log: {'loss': 0.4121, 'grad_norm': 5.2237958908081055, 'learning_rate': 9.802663186601647e-06} - - -[Rank 0] Trainer log: {'loss': 0.4121, 'grad_norm': 5.2237958908081055, 'learning_rate': 9.802663186601647e-06} -{'loss': 0.4121, 'grad_norm': 5.2237958908081055, 'learning_rate': 9.802663186601647e-06, 'epoch': 0.53} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.169815993309021, 'train/info_loss': 0.18057125806808472, 'train/ref_loss': None, 'train/uncertainty_loss': -9.531412506476046e-05, 'train/video_loss': 0.18047595024108887, 'train/total_loss': 0.3502919673919678} -tensor(0.0528, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0704, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00046075573191046716, 'train/lm_loss': 4.145669518038631e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.2637425363063812, 'train/uncertainty_loss': 0.007037167251110078, 'train/video_loss': 0.27448880672454834, 'train/total_loss': 0.2745302617549896} -[Rank 1] Trainer log: {'loss': 0.3686, 'grad_norm': 6.806494235992432, 'learning_rate': 9.791997830965543e-06}[Rank 3] Trainer log: {'loss': 0.3686, 'grad_norm': 6.806494235992432, 'learning_rate': 9.791997830965543e-06}[Rank 2] Trainer log: {'loss': 0.3686, 'grad_norm': 6.806494235992432, 'learning_rate': 9.791997830965543e-06} - - -[Rank 0] Trainer log: {'loss': 0.3686, 'grad_norm': 6.806494235992432, 'learning_rate': 9.791997830965543e-06} -{'loss': 0.3686, 'grad_norm': 6.806494235992432, 'learning_rate': 9.791997830965543e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0541, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022077534813433886, 'train/lm_loss': 6.886727060191333e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.25693479180336, 'train/uncertainty_loss': 0.005410416796803475, 'train/video_loss': 0.2641414999961853, 'train/total_loss': 0.26421037316322327} -tensor(0.4416, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24093453884124758, 'train/info_loss': 0.09037043899297714, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010554806794971228, 'train/video_loss': 0.09026489406824112, 'train/total_loss': 0.33119943737983704} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.2070, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4041, 'grad_norm': 5.10682487487793, 'learning_rate': 9.781332712028731e-06}[Rank 0] Trainer log: {'loss': 0.4041, 'grad_norm': 5.10682487487793, 'learning_rate': 9.781332712028731e-06} -[Rank 1] Trainer log: {'loss': 0.4041, 'grad_norm': 5.10682487487793, 'learning_rate': 9.781332712028731e-06} -[Rank 2] Trainer log: {'loss': 0.4041, 'grad_norm': 5.10682487487793, 'learning_rate': 9.781332712028731e-06} - -{'loss': 0.4041, 'grad_norm': 5.10682487487793, 'learning_rate': 9.781332712028731e-06, 'epoch': 0.53} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13408709764480592, 'train/info_loss': 0.07826387137174606, 'train/ref_loss': None, 'train/uncertainty_loss': -8.86229332536459e-05, 'train/video_loss': 0.07817524671554565, 'train/total_loss': 0.2122623473405838} -tensor(0.1551, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0006, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34188194274902345, 'train/info_loss': 0.15787264704704285, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011038222583010794, 'train/video_loss': 0.15776225924491882, 'train/total_loss': 0.4996442198753357} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1557, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2976, 'grad_norm': 4.018528938293457, 'learning_rate': 9.770667841927764e-06}[Rank 0] Trainer log: {'loss': 0.2976, 'grad_norm': 4.018528938293457, 'learning_rate': 9.770667841927764e-06} -[Rank 3] Trainer log: {'loss': 0.2976, 'grad_norm': 4.018528938293457, 'learning_rate': 9.770667841927764e-06} - -[Rank 2] Trainer log: {'loss': 0.2976, 'grad_norm': 4.018528938293457, 'learning_rate': 9.770667841927764e-06} -{'loss': 0.2976, 'grad_norm': 4.018528938293457, 'learning_rate': 9.770667841927764e-06, 'epoch': 0.53} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31538813114166264, 'train/info_loss': 0.2878419756889343, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012184723746031523, 'train/video_loss': 0.28772011399269104, 'train/total_loss': 0.603108286857605} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0757, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002881627297028899, 'train/lm_loss': 4.7082128003239634e-05, 'train/info_loss': 2.6344558136770502e-05, 'train/ref_loss': 0.26518481969833374, 'train/uncertainty_loss': 0.007572246342897416, 'train/video_loss': 0.27508869767189026, 'train/total_loss': 0.2751357853412628} -[Rank 3] Trainer log: {'loss': 0.4163, 'grad_norm': 4.89597749710083, 'learning_rate': 9.760003232798879e-06}[Rank 1] Trainer log: {'loss': 0.4163, 'grad_norm': 4.89597749710083, 'learning_rate': 9.760003232798879e-06} - -[Rank 0] Trainer log: {'loss': 0.4163, 'grad_norm': 4.89597749710083, 'learning_rate': 9.760003232798879e-06} -[Rank 2] Trainer log: {'loss': 0.4163, 'grad_norm': 4.89597749710083, 'learning_rate': 9.760003232798879e-06} -{'loss': 0.4163, 'grad_norm': 4.89597749710083, 'learning_rate': 9.760003232798879e-06, 'epoch': 0.53} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0251, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0613, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1955156445503235, 'train/info_loss': 0.1936686784029007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011405829573050142, 'train/video_loss': 0.19355462491512299, 'train/total_loss': 0.3890702724456787} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2648075819015503, 'train/info_loss': 0.20446257293224335, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012263103853911163, 'train/video_loss': 0.20433993637561798, 'train/total_loss': 0.4691475033760071} -tensor(0.0035, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1664, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0897, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3237, 'grad_norm': 3.4838531017303467, 'learning_rate': 9.749338896778048e-06}[Rank 3] Trainer log: {'loss': 0.3237, 'grad_norm': 3.4838531017303467, 'learning_rate': 9.749338896778048e-06}[Rank 0] Trainer log: {'loss': 0.3237, 'grad_norm': 3.4838531017303467, 'learning_rate': 9.749338896778048e-06} - - -[Rank 1] Trainer log: {'loss': 0.3237, 'grad_norm': 3.4838531017303467, 'learning_rate': 9.749338896778048e-06} -{'loss': 0.3237, 'grad_norm': 3.4838531017303467, 'learning_rate': 9.749338896778048e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020807110704481604, 'train/lm_loss': 8.862438844516874e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.195805162191391, 'train/uncertainty_loss': -6.847928161732853e-05, 'train/video_loss': 0.1974325329065323, 'train/total_loss': 0.1975211501121521} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36058945655822755, 'train/info_loss': 0.16579300165176392, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012042627204209567, 'train/video_loss': 0.16567257046699524, 'train/total_loss': 0.5262620449066162} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0208, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1084, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3522, 'grad_norm': 2.1928794384002686, 'learning_rate': 9.738674846000905e-06}[Rank 3] Trainer log: {'loss': 0.3522, 'grad_norm': 2.1928794384002686, 'learning_rate': 9.738674846000905e-06} -[Rank 2] Trainer log: {'loss': 0.3522, 'grad_norm': 2.1928794384002686, 'learning_rate': 9.738674846000905e-06} -[Rank 1] Trainer log: {'loss': 0.3522, 'grad_norm': 2.1928794384002686, 'learning_rate': 9.738674846000905e-06} - -{'loss': 0.3522, 'grad_norm': 2.1928794384002686, 'learning_rate': 9.738674846000905e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.46995482444763187, 'train/info_loss': 0.12922178208827972, 'train/ref_loss': None, 'train/uncertainty_loss': -9.968726662918926e-05, 'train/video_loss': 0.12912209331989288, 'train/total_loss': 0.5990769267082214} -tensor(0.1976, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003465271554887295, 'train/lm_loss': 3.180250932928175e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.21952688694000244, 'train/uncertainty_loss': -7.408594829030336e-05, 'train/video_loss': 0.22224807739257812, 'train/total_loss': 0.2222798764705658} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3215, 'grad_norm': 2.577279806137085, 'learning_rate': 9.72801109260278e-06} -[Rank 2] Trainer log: {'loss': 0.3215, 'grad_norm': 2.577279806137085, 'learning_rate': 9.72801109260278e-06}[Rank 3] Trainer log: {'loss': 0.3215, 'grad_norm': 2.577279806137085, 'learning_rate': 9.72801109260278e-06} - -[Rank 0] Trainer log: {'loss': 0.3215, 'grad_norm': 2.577279806137085, 'learning_rate': 9.72801109260278e-06} -{'loss': 0.3215, 'grad_norm': 2.577279806137085, 'learning_rate': 9.72801109260278e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1330, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0227, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002340949606150389, 'train/lm_loss': 6.901026936247946e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.19648388028144836, 'train/uncertainty_loss': 0.002269328013062477, 'train/video_loss': 0.2006567269563675, 'train/total_loss': 0.2007257342338562} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07664724588394166, 'train/info_loss': 0.18582814931869507, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011259358143433928, 'train/video_loss': 0.18571555614471436, 'train/total_loss': 0.2623628079891205} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1173, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.321, 'grad_norm': 3.118265151977539, 'learning_rate': 9.71734764871866e-06} -[Rank 3] Trainer log: {'loss': 0.321, 'grad_norm': 3.118265151977539, 'learning_rate': 9.71734764871866e-06}[Rank 0] Trainer log: {'loss': 0.321, 'grad_norm': 3.118265151977539, 'learning_rate': 9.71734764871866e-06} -[Rank 2] Trainer log: {'loss': 0.321, 'grad_norm': 3.118265151977539, 'learning_rate': 9.71734764871866e-06} - -{'loss': 0.321, 'grad_norm': 3.118265151977539, 'learning_rate': 9.71734764871866e-06, 'epoch': 0.53} -tensor(0.1174, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002742295851930976, 'train/lm_loss': 6.82952580973506e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.29380857944488525, 'train/uncertainty_loss': 0.011735703051090242, 'train/video_loss': 0.30776727199554443, 'train/total_loss': 0.30783557891845703} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24001338481903078, 'train/info_loss': 0.312690794467926, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014620739966630936, 'train/video_loss': 0.312544584274292, 'train/total_loss': 0.5525579452514648} -tensor(0.0692, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3743, 'grad_norm': 2.7257261276245117, 'learning_rate': 9.706684526483168e-06}[Rank 3] Trainer log: {'loss': 0.3743, 'grad_norm': 2.7257261276245117, 'learning_rate': 9.706684526483168e-06} -[Rank 0] Trainer log: {'loss': 0.3743, 'grad_norm': 2.7257261276245117, 'learning_rate': 9.706684526483168e-06} - -[Rank 2] Trainer log: {'loss': 0.3743, 'grad_norm': 2.7257261276245117, 'learning_rate': 9.706684526483168e-06} -{'loss': 0.3743, 'grad_norm': 2.7257261276245117, 'learning_rate': 9.706684526483168e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2571375608444214, 'train/info_loss': 0.22226837277412415, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001177371246740222, 'train/video_loss': 0.22215063869953156, 'train/total_loss': 0.4792882204055786} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1046, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00034622158855199817, 'train/lm_loss': 4.138518415857107e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.13371910154819489, 'train/uncertainty_loss': -7.283146842382849e-05, 'train/video_loss': 0.1364394575357437, 'train/total_loss': 0.13648083806037903} -[Rank 2] Trainer log: {'loss': 0.3622, 'grad_norm': 5.535956382751465, 'learning_rate': 9.696021738030575e-06} -[Rank 1] Trainer log: {'loss': 0.3622, 'grad_norm': 5.535956382751465, 'learning_rate': 9.696021738030575e-06} -[Rank 0] Trainer log: {'loss': 0.3622, 'grad_norm': 5.535956382751465, 'learning_rate': 9.696021738030575e-06}[Rank 3] Trainer log: {'loss': 0.3622, 'grad_norm': 5.535956382751465, 'learning_rate': 9.696021738030575e-06} - -{'loss': 0.3622, 'grad_norm': 5.535956382751465, 'learning_rate': 9.696021738030575e-06, 'epoch': 0.53} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.7946, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0885, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000155243871267885, 'train/lm_loss': 5.3374795243144035e-05, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.25474658608436584, 'train/uncertainty_loss': 0.008845546841621399, 'train/video_loss': 0.2648608386516571, 'train/total_loss': 0.26491421461105347} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20422778129577637, 'train/info_loss': 0.15859103202819824, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010192171903327108, 'train/video_loss': 0.15848910808563232, 'train/total_loss': 0.3627169132232666} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4406, 'grad_norm': 5.280895709991455, 'learning_rate': 9.685359295494764e-06}[Rank 1] Trainer log: {'loss': 0.4406, 'grad_norm': 5.280895709991455, 'learning_rate': 9.685359295494764e-06}[Rank 3] Trainer log: {'loss': 0.4406, 'grad_norm': 5.280895709991455, 'learning_rate': 9.685359295494764e-06} - - -[Rank 2] Trainer log: {'loss': 0.4406, 'grad_norm': 5.280895709991455, 'learning_rate': 9.685359295494764e-06} -{'loss': 0.4406, 'grad_norm': 5.280895709991455, 'learning_rate': 9.685359295494764e-06, 'epoch': 0.53} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0598, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(0.1941, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -{'train/tv_loss': 0.00020097726956009866, 'train/lm_loss': 4.1027629049494865e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.2568090558052063, 'train/uncertainty_loss': 0.005977282673120499, 'train/video_loss': 0.2644175589084625, 'train/total_loss': 0.2644585967063904} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07885547280311585, 'train/info_loss': 0.1465349644422531, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010343560716137291, 'train/video_loss': 0.14643153548240662, 'train/total_loss': 0.22528702020645142} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0392, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.347, 'grad_norm': 5.634327411651611, 'learning_rate': 9.67469721100923e-06}[Rank 2] Trainer log: {'loss': 0.347, 'grad_norm': 5.634327411651611, 'learning_rate': 9.67469721100923e-06}[Rank 1] Trainer log: {'loss': 0.347, 'grad_norm': 5.634327411651611, 'learning_rate': 9.67469721100923e-06} - -[Rank 3] Trainer log: {'loss': 0.347, 'grad_norm': 5.634327411651611, 'learning_rate': 9.67469721100923e-06} - -{'loss': 0.347, 'grad_norm': 5.634327411651611, 'learning_rate': 9.67469721100923e-06, 'epoch': 0.53} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30265533924102783, 'train/info_loss': 0.21858270466327667, 'train/ref_loss': None, 'train/uncertainty_loss': -9.96728427708149e-05, 'train/video_loss': 0.21848303079605103, 'train/total_loss': 0.5211383700370789} -tensor(0.0003, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.7897, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06621423959732056, 'train/info_loss': 0.15537168085575104, 'train/ref_loss': None, 'train/uncertainty_loss': -8.806685218587518e-05, 'train/video_loss': 0.1552836149930954, 'train/total_loss': 0.22149786353111267} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0755, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3926, 'grad_norm': 10.60463809967041, 'learning_rate': 9.664035496707054e-06}[Rank 2] Trainer log: {'loss': 0.3926, 'grad_norm': 10.60463809967041, 'learning_rate': 9.664035496707054e-06} - -[Rank 0] Trainer log: {'loss': 0.3926, 'grad_norm': 10.60463809967041, 'learning_rate': 9.664035496707054e-06}[Rank 3] Trainer log: {'loss': 0.3926, 'grad_norm': 10.60463809967041, 'learning_rate': 9.664035496707054e-06} - -{'loss': 0.3926, 'grad_norm': 10.60463809967041, 'learning_rate': 9.664035496707054e-06, 'epoch': 0.54} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4752538681030274, 'train/info_loss': 0.17347098886966705, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014194814721122385, 'train/video_loss': 0.17332904040813446, 'train/total_loss': 0.648582935333252} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3593, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0286, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023177650291472673, 'train/lm_loss': 6.06921617873013e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.23189103603363037, 'train/uncertainty_loss': 0.002863031066954136, 'train/video_loss': 0.23663564026355743, 'train/total_loss': 0.23669633269309998} -[Rank 0] Trainer log: {'loss': 0.3579, 'grad_norm': 8.113485336303711, 'learning_rate': 9.653374164720898e-06}[Rank 1] Trainer log: {'loss': 0.3579, 'grad_norm': 8.113485336303711, 'learning_rate': 9.653374164720898e-06} -[Rank 3] Trainer log: {'loss': 0.3579, 'grad_norm': 8.113485336303711, 'learning_rate': 9.653374164720898e-06} - -[Rank 2] Trainer log: {'loss': 0.3579, 'grad_norm': 8.113485336303711, 'learning_rate': 9.653374164720898e-06} -{'loss': 0.3579, 'grad_norm': 8.113485336303711, 'learning_rate': 9.653374164720898e-06, 'epoch': 0.54} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29831943511962894, 'train/info_loss': 0.22511884570121765, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014594700187444686, 'train/video_loss': 0.2249729037284851, 'train/total_loss': 0.5232923030853271} -tensor(0.1100, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.6478, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0020, device='cuda:2', grad_fn=) tensor(-0.0020, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33261654376983646, 'train/info_loss': 0.11136122792959213, 'train/ref_loss': None, 'train/uncertainty_loss': -8.942072163335979e-05, 'train/video_loss': 0.11127180606126785, 'train/total_loss': 0.4438883364200592} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0198, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4554, 'grad_norm': 9.882575988769531, 'learning_rate': 9.642713227182999e-06}[Rank 0] Trainer log: {'loss': 0.4554, 'grad_norm': 9.882575988769531, 'learning_rate': 9.642713227182999e-06}[Rank 3] Trainer log: {'loss': 0.4554, 'grad_norm': 9.882575988769531, 'learning_rate': 9.642713227182999e-06} - - -[Rank 2] Trainer log: {'loss': 0.4554, 'grad_norm': 9.882575988769531, 'learning_rate': 9.642713227182999e-06} -{'loss': 0.4554, 'grad_norm': 9.882575988769531, 'learning_rate': 9.642713227182999e-06, 'epoch': 0.54} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=)tensor(0.2230, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.00015724065015092494, 'train/lm_loss': 5.218301666900516e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.17848537862300873, 'train/uncertainty_loss': -6.778498063795269e-05, 'train/video_loss': 0.17970162630081177, 'train/total_loss': 0.17975381016731262} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27405126094818116, 'train/info_loss': 0.1542804092168808, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014063097769394519, 'train/video_loss': 0.15413977205753326, 'train/total_loss': 0.42819106578826904} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3603, 'grad_norm': 2.149109363555908, 'learning_rate': 9.632052696225127e-06}[Rank 3] Trainer log: {'loss': 0.3603, 'grad_norm': 2.149109363555908, 'learning_rate': 9.632052696225127e-06}[Rank 0] Trainer log: {'loss': 0.3603, 'grad_norm': 2.149109363555908, 'learning_rate': 9.632052696225127e-06} - - -[Rank 1] Trainer log: {'loss': 0.3603, 'grad_norm': 2.149109363555908, 'learning_rate': 9.632052696225127e-06} -{'loss': 0.3603, 'grad_norm': 2.149109363555908, 'learning_rate': 9.632052696225127e-06, 'epoch': 0.54} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1592, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031643721740692854, 'train/lm_loss': 4.7034455928951505e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.31853726506233215, 'train/uncertainty_loss': 0.015915381908416747, 'train/video_loss': 0.3370087146759033, 'train/total_loss': 0.3370557427406311} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2602261781692505, 'train/info_loss': 0.21789927780628204, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001425767783075571, 'train/video_loss': 0.2177567034959793, 'train/total_loss': 0.47798287868499756} -tensor(0.3804, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2051, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4092, 'grad_norm': 10.236905097961426, 'learning_rate': 9.621392583978606e-06}[Rank 3] Trainer log: {'loss': 0.4092, 'grad_norm': 10.236905097961426, 'learning_rate': 9.621392583978606e-06} - -[Rank 0] Trainer log: {'loss': 0.4092, 'grad_norm': 10.236905097961426, 'learning_rate': 9.621392583978606e-06} -[Rank 2] Trainer log: {'loss': 0.4092, 'grad_norm': 10.236905097961426, 'learning_rate': 9.621392583978606e-06} -{'loss': 0.4092, 'grad_norm': 10.236905097961426, 'learning_rate': 9.621392583978606e-06, 'epoch': 0.54} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33124206066131595, 'train/info_loss': 0.1399199664592743, 'train/ref_loss': None, 'train/uncertainty_loss': -9.411557693965733e-05, 'train/video_loss': 0.13982585072517395, 'train/total_loss': 0.4710679054260254} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025881533510982994, 'train/lm_loss': 6.886727060191333e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.1887257844209671, 'train/uncertainty_loss': -6.947320071049035e-05, 'train/video_loss': 0.1907578855752945, 'train/total_loss': 0.19082675874233246} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4743, 'grad_norm': 7.671222686767578, 'learning_rate': 9.610732902574268e-06} -[Rank 2] Trainer log: {'loss': 0.4743, 'grad_norm': 7.671222686767578, 'learning_rate': 9.610732902574268e-06}[Rank 0] Trainer log: {'loss': 0.4743, 'grad_norm': 7.671222686767578, 'learning_rate': 9.610732902574268e-06} - -[Rank 3] Trainer log: {'loss': 0.4743, 'grad_norm': 7.671222686767578, 'learning_rate': 9.610732902574268e-06} -{'loss': 0.4743, 'grad_norm': 7.671222686767578, 'learning_rate': 9.610732902574268e-06, 'epoch': 0.54} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3333335638046265, 'train/info_loss': 0.12767177820205688, 'train/ref_loss': None, 'train/uncertainty_loss': -9.277042117901147e-05, 'train/video_loss': 0.127579003572464, 'train/total_loss': 0.4609125852584839} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2512779474258423, 'train/info_loss': 0.2819758951663971, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013534653699025513, 'train/video_loss': 0.28184056282043457, 'train/total_loss': 0.533118486404419} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3674, 'grad_norm': 4.5042266845703125, 'learning_rate': 9.600073664142473e-06}[Rank 3] Trainer log: {'loss': 0.3674, 'grad_norm': 4.5042266845703125, 'learning_rate': 9.600073664142473e-06} -[Rank 1] Trainer log: {'loss': 0.3674, 'grad_norm': 4.5042266845703125, 'learning_rate': 9.600073664142473e-06} - -[Rank 2] Trainer log: {'loss': 0.3674, 'grad_norm': 4.5042266845703125, 'learning_rate': 9.600073664142473e-06} -{'loss': 0.3674, 'grad_norm': 4.5042266845703125, 'learning_rate': 9.600073664142473e-06, 'epoch': 0.54} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1767, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023776006419211628, 'train/lm_loss': 8.845757110975683e-05, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.3290175199508667, 'train/uncertainty_loss': 0.01766931414604187, 'train/video_loss': 0.3486209213733673, 'train/total_loss': 0.348709374666214} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3483451843261719, 'train/info_loss': 0.2062227874994278, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010193883208557964, 'train/video_loss': 0.20612084865570068, 'train/total_loss': 0.5544660091400146} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3589, 'grad_norm': 7.100051403045654, 'learning_rate': 9.589414880813058e-06}[Rank 2] Trainer log: {'loss': 0.3589, 'grad_norm': 7.100051403045654, 'learning_rate': 9.589414880813058e-06} - -[Rank 1] Trainer log: {'loss': 0.3589, 'grad_norm': 7.100051403045654, 'learning_rate': 9.589414880813058e-06} -[Rank 0] Trainer log: {'loss': 0.3589, 'grad_norm': 7.100051403045654, 'learning_rate': 9.589414880813058e-06} -{'loss': 0.3589, 'grad_norm': 7.100051403045654, 'learning_rate': 9.589414880813058e-06, 'epoch': 0.54} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24530386924743652, 'train/info_loss': 0.19854773581027985, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010724200401455164, 'train/video_loss': 0.19844049215316772, 'train/total_loss': 0.44374436140060425} -tensor(0.0892, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36172034740448, 'train/info_loss': 0.1608542650938034, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011244749184697868, 'train/video_loss': 0.16074182093143463, 'train/total_loss': 0.5224621891975403} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.8912, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4786, 'grad_norm': 15.091630935668945, 'learning_rate': 9.578756564715355e-06} -[Rank 2] Trainer log: {'loss': 0.4786, 'grad_norm': 15.091630935668945, 'learning_rate': 9.578756564715355e-06} -[Rank 0] Trainer log: {'loss': 0.4786, 'grad_norm': 15.091630935668945, 'learning_rate': 9.578756564715355e-06}[Rank 3] Trainer log: {'loss': 0.4786, 'grad_norm': 15.091630935668945, 'learning_rate': 9.578756564715355e-06} - -{'loss': 0.4786, 'grad_norm': 15.091630935668945, 'learning_rate': 9.578756564715355e-06, 'epoch': 0.54} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1895, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002889050403609872, 'train/lm_loss': 6.028697243891657e-05, 'train/info_loss': 2.7596188374445774e-05, 'train/ref_loss': 0.33519095182418823, 'train/uncertainty_loss': 0.018952089548110965, 'train/video_loss': 0.3564818799495697, 'train/total_loss': 0.35654217004776} -tensor(0.0892, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2042630434036255, 'train/info_loss': 0.2433052659034729, 'train/ref_loss': None, 'train/uncertainty_loss': -8.26373405288905e-05, 'train/video_loss': 0.24322262406349182, 'train/total_loss': 0.44748568534851074} -tensor(0.0385, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3558, 'grad_norm': 13.5802640914917, 'learning_rate': 9.56809872797816e-06}[Rank 3] Trainer log: {'loss': 0.3558, 'grad_norm': 13.5802640914917, 'learning_rate': 9.56809872797816e-06} -[Rank 2] Trainer log: {'loss': 0.3558, 'grad_norm': 13.5802640914917, 'learning_rate': 9.56809872797816e-06} - -[Rank 0] Trainer log: {'loss': 0.3558, 'grad_norm': 13.5802640914917, 'learning_rate': 9.56809872797816e-06} -{'loss': 0.3558, 'grad_norm': 13.5802640914917, 'learning_rate': 9.56809872797816e-06, 'epoch': 0.54} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18093007802963257, 'train/info_loss': 0.20319950580596924, 'train/ref_loss': None, 'train/uncertainty_loss': -9.548667585477233e-05, 'train/video_loss': 0.20310401916503906, 'train/total_loss': 0.38403409719467163} -tensor(0.2134, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0990, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.3164, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003065637778490782, 'train/lm_loss': 3.2017051125876606e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.4119284749031067, 'train/uncertainty_loss': 0.03163539469242096, 'train/video_loss': 0.44603773951530457, 'train/total_loss': 0.44606974720954895} -[Rank 3] Trainer log: {'loss': 0.3721, 'grad_norm': 13.390600204467773, 'learning_rate': 9.557441382729723e-06} -[Rank 2] Trainer log: {'loss': 0.3721, 'grad_norm': 13.390600204467773, 'learning_rate': 9.557441382729723e-06}[Rank 1] Trainer log: {'loss': 0.3721, 'grad_norm': 13.390600204467773, 'learning_rate': 9.557441382729723e-06} - -[Rank 0] Trainer log: {'loss': 0.3721, 'grad_norm': 13.390600204467773, 'learning_rate': 9.557441382729723e-06} -{'loss': 0.3721, 'grad_norm': 13.390600204467773, 'learning_rate': 9.557441382729723e-06, 'epoch': 0.54} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0828, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019380693556740882, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.1898394674062729, 'train/uncertainty_loss': -6.779891555197537e-05, 'train/video_loss': 0.19134244322776794, 'train/total_loss': 0.19137054681777954} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2270, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004580265376716852, 'train/lm_loss': 6.095434073358775e-05, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.3599044680595398, 'train/uncertainty_loss': 0.02269997298717499, 'train/video_loss': 0.38629692792892456, 'train/total_loss': 0.3863578736782074} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3589, 'grad_norm': 9.940496444702148, 'learning_rate': 9.54678454109774e-06}[Rank 3] Trainer log: {'loss': 0.3589, 'grad_norm': 9.940496444702148, 'learning_rate': 9.54678454109774e-06} - -[Rank 2] Trainer log: {'loss': 0.3589, 'grad_norm': 9.940496444702148, 'learning_rate': 9.54678454109774e-06} -[Rank 0] Trainer log: {'loss': 0.3589, 'grad_norm': 9.940496444702148, 'learning_rate': 9.54678454109774e-06} -{'loss': 0.3589, 'grad_norm': 9.940496444702148, 'learning_rate': 9.54678454109774e-06, 'epoch': 0.54} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1896279811859131, 'train/info_loss': 0.18844792246818542, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011336342431604862, 'train/video_loss': 0.18833455443382263, 'train/total_loss': 0.37796252965927124} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.345444130897522, 'train/info_loss': 0.22835075855255127, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014664926566183568, 'train/video_loss': 0.22820411622524261, 'train/total_loss': 0.5736482739448547} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4054, 'grad_norm': 4.151080131530762, 'learning_rate': 9.53612821520932e-06} -[Rank 0] Trainer log: {'loss': 0.4054, 'grad_norm': 4.151080131530762, 'learning_rate': 9.53612821520932e-06}[Rank 1] Trainer log: {'loss': 0.4054, 'grad_norm': 4.151080131530762, 'learning_rate': 9.53612821520932e-06} - -[Rank 3] Trainer log: {'loss': 0.4054, 'grad_norm': 4.151080131530762, 'learning_rate': 9.53612821520932e-06} -{'loss': 0.4054, 'grad_norm': 4.151080131530762, 'learning_rate': 9.53612821520932e-06, 'epoch': 0.54} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0282, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0609, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002624561544507742, 'train/lm_loss': 5.3708493942394854e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.24082404375076294, 'train/uncertainty_loss': 0.006088812649250031, 'train/video_loss': 0.2490394413471222, 'train/total_loss': 0.24909314513206482} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0409, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00037947793025523425, 'train/lm_loss': 6.0596823459491134e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.24587687849998474, 'train/uncertainty_loss': 0.004090975597500802, 'train/video_loss': 0.2530328333377838, 'train/total_loss': 0.253093421459198} -[Rank 1] Trainer log: {'loss': 0.35, 'grad_norm': 5.5853800773620605, 'learning_rate': 9.525472417191007e-06} -[Rank 2] Trainer log: {'loss': 0.35, 'grad_norm': 5.5853800773620605, 'learning_rate': 9.525472417191007e-06} -[Rank 0] Trainer log: {'loss': 0.35, 'grad_norm': 5.5853800773620605, 'learning_rate': 9.525472417191007e-06}[Rank 3] Trainer log: {'loss': 0.35, 'grad_norm': 5.5853800773620605, 'learning_rate': 9.525472417191007e-06} - -{'loss': 0.35, 'grad_norm': 5.5853800773620605, 'learning_rate': 9.525472417191007e-06, 'epoch': 0.54} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34693470001220705, 'train/info_loss': 0.3046843707561493, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011773339938372374, 'train/video_loss': 0.3045666515827179, 'train/total_loss': 0.6515013575553894} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0332, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003025880549103022, 'train/lm_loss': 4.150436725467444e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.18963420391082764, 'train/uncertainty_loss': -6.800792762078345e-05, 'train/video_loss': 0.19201146066188812, 'train/total_loss': 0.192052960395813} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4148, 'grad_norm': 4.034366607666016, 'learning_rate': 9.514817159168716e-06}[Rank 1] Trainer log: {'loss': 0.4148, 'grad_norm': 4.034366607666016, 'learning_rate': 9.514817159168716e-06} -[Rank 3] Trainer log: {'loss': 0.4148, 'grad_norm': 4.034366607666016, 'learning_rate': 9.514817159168716e-06} - -[Rank 2] Trainer log: {'loss': 0.4148, 'grad_norm': 4.034366607666016, 'learning_rate': 9.514817159168716e-06} -{'loss': 0.4148, 'grad_norm': 4.034366607666016, 'learning_rate': 9.514817159168716e-06, 'epoch': 0.54} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31916022300720215, 'train/info_loss': 0.15121236443519592, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010216149967163802, 'train/video_loss': 0.1511102020740509, 'train/total_loss': 0.47027042508125305} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14652259349823, 'train/info_loss': 0.12283731251955032, 'train/ref_loss': None, 'train/uncertainty_loss': -9.606963722035289e-05, 'train/video_loss': 0.12274124473333359, 'train/total_loss': 0.2692638337612152} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0078, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3051, 'grad_norm': 6.526281833648682, 'learning_rate': 9.504162453267776e-06}[Rank 1] Trainer log: {'loss': 0.3051, 'grad_norm': 6.526281833648682, 'learning_rate': 9.504162453267776e-06} -[Rank 2] Trainer log: {'loss': 0.3051, 'grad_norm': 6.526281833648682, 'learning_rate': 9.504162453267776e-06} - -[Rank 3] Trainer log: {'loss': 0.3051, 'grad_norm': 6.526281833648682, 'learning_rate': 9.504162453267776e-06} -{'loss': 0.3051, 'grad_norm': 6.526281833648682, 'learning_rate': 9.504162453267776e-06, 'epoch': 0.54} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24257307052612306, 'train/info_loss': 0.24217525124549866, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000113467569462955, 'train/video_loss': 0.2420617789030075, 'train/total_loss': 0.4846348464488983} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0360, device='cuda:2', grad_fn=) tensor(0.1093, device='cuda:1', grad_fn=) tensor(-0.0006, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0649, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00045130364596843723, 'train/lm_loss': 6.870043580420315e-05, 'train/info_loss': 2.8907417799928226e-05, 'train/ref_loss': 0.24687933921813965, 'train/uncertainty_loss': 0.006489764153957367, 'train/video_loss': 0.25700843334198, 'train/total_loss': 0.2570771276950836} -tensor(0.0597, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3263, 'grad_norm': 2.03303861618042, 'learning_rate': 9.493508311612874e-06}[Rank 0] Trainer log: {'loss': 0.3263, 'grad_norm': 2.03303861618042, 'learning_rate': 9.493508311612874e-06}[Rank 1] Trainer log: {'loss': 0.3263, 'grad_norm': 2.03303861618042, 'learning_rate': 9.493508311612874e-06} - - -[Rank 3] Trainer log: {'loss': 0.3263, 'grad_norm': 2.03303861618042, 'learning_rate': 9.493508311612874e-06}{'loss': 0.3263, 'grad_norm': 2.03303861618042, 'learning_rate': 9.493508311612874e-06, 'epoch': 0.54} - -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1230, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0251, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1154, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00044967769645154476, 'train/lm_loss': 4.658156540244818e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.2898183763027191, 'train/uncertainty_loss': 0.011542823165655136, 'train/video_loss': 0.3049839437007904, 'train/total_loss': 0.3050305247306824} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0871050238609314, 'train/info_loss': 0.2609493136405945, 'train/ref_loss': None, 'train/uncertainty_loss': -9.86261060461402e-05, 'train/video_loss': 0.2608506977558136, 'train/total_loss': 0.34795573353767395} -[Rank 0] Trainer log: {'loss': 0.3944, 'grad_norm': 5.526535987854004, 'learning_rate': 9.482854746328051e-06}[Rank 2] Trainer log: {'loss': 0.3944, 'grad_norm': 5.526535987854004, 'learning_rate': 9.482854746328051e-06} -[Rank 1] Trainer log: {'loss': 0.3944, 'grad_norm': 5.526535987854004, 'learning_rate': 9.482854746328051e-06} - -{'loss': 0.3944, 'grad_norm': 5.526535987854004, 'learning_rate': 9.482854746328051e-06, 'epoch': 0.54}[Rank 3] Trainer log: {'loss': 0.3944, 'grad_norm': 5.526535987854004, 'learning_rate': 9.482854746328051e-06} - -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22970468997955323, 'train/info_loss': 0.27847427129745483, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012597715249285102, 'train/video_loss': 0.27834829688072205, 'train/total_loss': 0.5080530047416687} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2651612997055054, 'train/info_loss': 0.36612316966056824, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012617559405043722, 'train/video_loss': 0.36599698662757874, 'train/total_loss': 0.6311582922935486} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5193, 'grad_norm': 3.7453277111053467, 'learning_rate': 9.4722017695367e-06} -[Rank 2] Trainer log: {'loss': 0.5193, 'grad_norm': 3.7453277111053467, 'learning_rate': 9.4722017695367e-06} -[Rank 0] Trainer log: {'loss': 0.5193, 'grad_norm': 3.7453277111053467, 'learning_rate': 9.4722017695367e-06}[Rank 3] Trainer log: {'loss': 0.5193, 'grad_norm': 3.7453277111053467, 'learning_rate': 9.4722017695367e-06} - -{'loss': 0.5193, 'grad_norm': 3.7453277111053467, 'learning_rate': 9.4722017695367e-06, 'epoch': 0.54} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2248347043991089, 'train/info_loss': 0.21403174102306366, 'train/ref_loss': None, 'train/uncertainty_loss': -9.104302152991296e-05, 'train/video_loss': 0.21394069492816925, 'train/total_loss': 0.4387754201889038} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22490966320037842, 'train/info_loss': 0.18516968190670013, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011789656709879637, 'train/video_loss': 0.1850517839193344, 'train/total_loss': 0.409961462020874} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.1935, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3696, 'grad_norm': 4.202642440795898, 'learning_rate': 9.461549393361538e-06}[Rank 3] Trainer log: {'loss': 0.3696, 'grad_norm': 4.202642440795898, 'learning_rate': 9.461549393361538e-06} - -[Rank 0] Trainer log: {'loss': 0.3696, 'grad_norm': 4.202642440795898, 'learning_rate': 9.461549393361538e-06} -[Rank 2] Trainer log: {'loss': 0.3696, 'grad_norm': 4.202642440795898, 'learning_rate': 9.461549393361538e-06} -{'loss': 0.3696, 'grad_norm': 4.202642440795898, 'learning_rate': 9.461549393361538e-06, 'epoch': 0.54} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.3053, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.2125, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00045595299452543263, 'train/lm_loss': 7.835283759050071e-05, 'train/info_loss': 3.0516646802425385e-05, 'train/ref_loss': 0.34991657733917236, 'train/uncertainty_loss': 0.02124537378549576, 'train/video_loss': 0.3748400807380676, 'train/total_loss': 0.3749184310436249} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1531, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022519740741699934, 'train/lm_loss': 9.532087133266032e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.3143131732940674, 'train/uncertainty_loss': 0.0153070405125618, 'train/video_loss': 0.3314528465270996, 'train/total_loss': 0.33154815435409546} -[Rank 1] Trainer log: {'loss': 0.3958, 'grad_norm': 10.216862678527832, 'learning_rate': 9.45089762992461e-06}[Rank 3] Trainer log: {'loss': 0.3958, 'grad_norm': 10.216862678527832, 'learning_rate': 9.45089762992461e-06} -[Rank 0] Trainer log: {'loss': 0.3958, 'grad_norm': 10.216862678527832, 'learning_rate': 9.45089762992461e-06} - -[Rank 2] Trainer log: {'loss': 0.3958, 'grad_norm': 10.216862678527832, 'learning_rate': 9.45089762992461e-06} -{'loss': 0.3958, 'grad_norm': 10.216862678527832, 'learning_rate': 9.45089762992461e-06, 'epoch': 0.54} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.3185, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00033856565132737163, 'train/lm_loss': 4.681992868427187e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.4195234179496765, 'train/uncertainty_loss': 0.03184826672077179, 'train/video_loss': 0.45410674810409546, 'train/total_loss': 0.45415356755256653} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0002, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0518, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030450262129306794, 'train/lm_loss': 4.093228199053556e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.25722718238830566, 'train/uncertainty_loss': 0.005177756771445274, 'train/video_loss': 0.26486629247665405, 'train/total_loss': 0.26490721106529236} -tensor(0.1808, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2976, 'grad_norm': 1.4003363847732544, 'learning_rate': 9.440246491347244e-06}[Rank 0] Trainer log: {'loss': 0.2976, 'grad_norm': 1.4003363847732544, 'learning_rate': 9.440246491347244e-06}[Rank 1] Trainer log: {'loss': 0.2976, 'grad_norm': 1.4003363847732544, 'learning_rate': 9.440246491347244e-06} - - -[Rank 2] Trainer log: {'loss': 0.2976, 'grad_norm': 1.4003363847732544, 'learning_rate': 9.440246491347244e-06} -{'loss': 0.2976, 'grad_norm': 1.4003363847732544, 'learning_rate': 9.440246491347244e-06, 'epoch': 0.54} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3065045356750489, 'train/info_loss': 0.23876319825649261, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013836227590218188, 'train/video_loss': 0.23862484097480774, 'train/total_loss': 0.5451294183731079} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.4877, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1840248465538025, 'train/info_loss': 0.17625491321086884, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010029881959781051, 'train/video_loss': 0.17615461349487305, 'train/total_loss': 0.36017948389053345} -[Rank 3] Trainer log: {'loss': 0.4701, 'grad_norm': 5.809206485748291, 'learning_rate': 9.429595989750074e-06} -[Rank 1] Trainer log: {'loss': 0.4701, 'grad_norm': 5.809206485748291, 'learning_rate': 9.429595989750074e-06} -[Rank 0] Trainer log: {'loss': 0.4701, 'grad_norm': 5.809206485748291, 'learning_rate': 9.429595989750074e-06}[Rank 2] Trainer log: {'loss': 0.4701, 'grad_norm': 5.809206485748291, 'learning_rate': 9.429595989750074e-06} - -{'loss': 0.4701, 'grad_norm': 5.809206485748291, 'learning_rate': 9.429595989750074e-06, 'epoch': 0.54} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1405, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0418, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002643181011080742, 'train/lm_loss': 4.741583543363959e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.2448350191116333, 'train/uncertainty_loss': 0.004178495332598686, 'train/video_loss': 0.2511526346206665, 'train/total_loss': 0.2512000501155853} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.43758645057678225, 'train/info_loss': 0.1407741904258728, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012122861808165908, 'train/video_loss': 0.14065295457839966, 'train/total_loss': 0.5782394409179688} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0573, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.362, 'grad_norm': 1.4374339580535889, 'learning_rate': 9.41894613725301e-06} -[Rank 0] Trainer log: {'loss': 0.362, 'grad_norm': 1.4374339580535889, 'learning_rate': 9.41894613725301e-06} -[Rank 3] Trainer log: {'loss': 0.362, 'grad_norm': 1.4374339580535889, 'learning_rate': 9.41894613725301e-06}[Rank 2] Trainer log: {'loss': 0.362, 'grad_norm': 1.4374339580535889, 'learning_rate': 9.41894613725301e-06} - -{'loss': 0.362, 'grad_norm': 1.4374339580535889, 'learning_rate': 9.41894613725301e-06, 'epoch': 0.54} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32096657752990726, 'train/info_loss': 0.18470431864261627, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014291390543803574, 'train/video_loss': 0.18456140160560608, 'train/total_loss': 0.5055279731750488} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2177116632461548, 'train/info_loss': 0.19812767207622528, 'train/ref_loss': None, 'train/uncertainty_loss': -9.276440832763911e-05, 'train/video_loss': 0.19803491234779358, 'train/total_loss': 0.4157465696334839} -tensor(0.0616, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3353, 'grad_norm': 3.6279196739196777, 'learning_rate': 9.408296945975205e-06}[Rank 3] Trainer log: {'loss': 0.3353, 'grad_norm': 3.6279196739196777, 'learning_rate': 9.408296945975205e-06}[Rank 2] Trainer log: {'loss': 0.3353, 'grad_norm': 3.6279196739196777, 'learning_rate': 9.408296945975205e-06} - - -[Rank 1] Trainer log: {'loss': 0.3353, 'grad_norm': 3.6279196739196777, 'learning_rate': 9.408296945975205e-06} -{'loss': 0.3353, 'grad_norm': 3.6279196739196777, 'learning_rate': 9.408296945975205e-06, 'epoch': 0.54} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06661384701728822, 'train/info_loss': 0.13498245179653168, 'train/ref_loss': None, 'train/uncertainty_loss': -8.586746989749373e-05, 'train/video_loss': 0.13489659130573273, 'train/total_loss': 0.20151042938232422} -tensor(0.2262, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004478378687053919, 'train/lm_loss': 8.857672801241279e-05, 'train/info_loss': 2.8907417799928226e-05, 'train/ref_loss': 0.1260557621717453, 'train/uncertainty_loss': -7.624886347912252e-05, 'train/video_loss': 0.12959112226963043, 'train/total_loss': 0.12967969477176666} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2429, 'grad_norm': 3.0552499294281006, 'learning_rate': 9.397648428035082e-06}[Rank 1] Trainer log: {'loss': 0.2429, 'grad_norm': 3.0552499294281006, 'learning_rate': 9.397648428035082e-06} - -[Rank 2] Trainer log: {'loss': 0.2429, 'grad_norm': 3.0552499294281006, 'learning_rate': 9.397648428035082e-06}[Rank 0] Trainer log: {'loss': 0.2429, 'grad_norm': 3.0552499294281006, 'learning_rate': 9.397648428035082e-06} - -{'loss': 0.2429, 'grad_norm': 3.0552499294281006, 'learning_rate': 9.397648428035082e-06, 'epoch': 0.54} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.4981, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001827665837481618, 'train/lm_loss': 9.544002241455019e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.19224651157855988, 'train/uncertainty_loss': -7.231468334794045e-05, 'train/video_loss': 0.19366809725761414, 'train/total_loss': 0.19376353919506073} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4256267547607422, 'train/info_loss': 0.23845143616199493, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014294867869466544, 'train/video_loss': 0.23830848932266235, 'train/total_loss': 0.6639352440834045} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1970, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4172, 'grad_norm': 3.3400685787200928, 'learning_rate': 9.387000595550282e-06}[Rank 1] Trainer log: {'loss': 0.4172, 'grad_norm': 3.3400685787200928, 'learning_rate': 9.387000595550282e-06}[Rank 3] Trainer log: {'loss': 0.4172, 'grad_norm': 3.3400685787200928, 'learning_rate': 9.387000595550282e-06} - - -[Rank 2] Trainer log: {'loss': 0.4172, 'grad_norm': 3.3400685787200928, 'learning_rate': 9.387000595550282e-06} -{'loss': 0.4172, 'grad_norm': 3.3400685787200928, 'learning_rate': 9.387000595550282e-06, 'epoch': 0.54} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12970561981201173, 'train/info_loss': 0.12115035951137543, 'train/ref_loss': None, 'train/uncertainty_loss': -9.730044985190034e-05, 'train/video_loss': 0.1210530623793602, 'train/total_loss': 0.25075867772102356} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1318, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2382072448730469, 'train/info_loss': 0.21938000619411469, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012298480141907932, 'train/video_loss': 0.21925702691078186, 'train/total_loss': 0.4574642777442932} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1727, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2594, 'grad_norm': 8.81113338470459, 'learning_rate': 9.37635346063767e-06}[Rank 1] Trainer log: {'loss': 0.2594, 'grad_norm': 8.81113338470459, 'learning_rate': 9.37635346063767e-06} - -[Rank 0] Trainer log: {'loss': 0.2594, 'grad_norm': 8.81113338470459, 'learning_rate': 9.37635346063767e-06} -[Rank 2] Trainer log: {'loss': 0.2594, 'grad_norm': 8.81113338470459, 'learning_rate': 9.37635346063767e-06} -{'loss': 0.2594, 'grad_norm': 8.81113338470459, 'learning_rate': 9.37635346063767e-06, 'epoch': 0.54} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20512323379516603, 'train/info_loss': 0.14834271371364594, 'train/ref_loss': None, 'train/uncertainty_loss': -9.815541561692954e-05, 'train/video_loss': 0.14824455976486206, 'train/total_loss': 0.35336780548095703} -tensor(0.5617, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3618662357330322, 'train/info_loss': 0.26912254095077515, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001406722003594041, 'train/video_loss': 0.2689818739891052, 'train/total_loss': 0.6308481097221375} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4287, 'grad_norm': 9.235176086425781, 'learning_rate': 9.365707035413324e-06}[Rank 0] Trainer log: {'loss': 0.4287, 'grad_norm': 9.235176086425781, 'learning_rate': 9.365707035413324e-06}[Rank 3] Trainer log: {'loss': 0.4287, 'grad_norm': 9.235176086425781, 'learning_rate': 9.365707035413324e-06} - - -[Rank 2] Trainer log: {'loss': 0.4287, 'grad_norm': 9.235176086425781, 'learning_rate': 9.365707035413324e-06} -{'loss': 0.4287, 'grad_norm': 9.235176086425781, 'learning_rate': 9.365707035413324e-06, 'epoch': 0.54} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3539663553237915, 'train/info_loss': 0.16615357995033264, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011482088593766094, 'train/video_loss': 0.16603876650333405, 'train/total_loss': 0.5200051069259644} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29586100578308105, 'train/info_loss': 0.2800045311450958, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011517516104504467, 'train/video_loss': 0.2798893451690674, 'train/total_loss': 0.5757503509521484} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4142, 'grad_norm': 4.4888410568237305, 'learning_rate': 9.355061331992503e-06}[Rank 0] Trainer log: {'loss': 0.4142, 'grad_norm': 4.4888410568237305, 'learning_rate': 9.355061331992503e-06} -[Rank 3] Trainer log: {'loss': 0.4142, 'grad_norm': 4.4888410568237305, 'learning_rate': 9.355061331992503e-06} - -[Rank 1] Trainer log: {'loss': 0.4142, 'grad_norm': 4.4888410568237305, 'learning_rate': 9.355061331992503e-06}{'loss': 0.4142, 'grad_norm': 4.4888410568237305, 'learning_rate': 9.355061331992503e-06, 'epoch': 0.54} - -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0384, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003199937986209989, 'train/lm_loss': 6.119268946349622e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.23720648884773254, 'train/uncertainty_loss': 0.0038350820541381836, 'train/video_loss': 0.24362953007221222, 'train/total_loss': 0.24369072914123535} -tensor(0.1095, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.8170, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30412433147430423, 'train/info_loss': 0.20032528042793274, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013496396131813527, 'train/video_loss': 0.20019032061100006, 'train/total_loss': 0.504314661026001} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0096, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3651, 'grad_norm': 15.612680435180664, 'learning_rate': 9.344416362489656e-06}[Rank 0] Trainer log: {'loss': 0.3651, 'grad_norm': 15.612680435180664, 'learning_rate': 9.344416362489656e-06}[Rank 3] Trainer log: {'loss': 0.3651, 'grad_norm': 15.612680435180664, 'learning_rate': 9.344416362489656e-06} - - -[Rank 1] Trainer log: {'loss': 0.3651, 'grad_norm': 15.612680435180664, 'learning_rate': 9.344416362489656e-06} -{'loss': 0.3651, 'grad_norm': 15.612680435180664, 'learning_rate': 9.344416362489656e-06, 'epoch': 0.54} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18944610357284547, 'train/info_loss': 0.16067780554294586, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011183180613443256, 'train/video_loss': 0.16056597232818604, 'train/total_loss': 0.35001206398010254} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0079, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35603802204132085, 'train/info_loss': 0.1659715622663498, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001357730943709612, 'train/video_loss': 0.16583578288555145, 'train/total_loss': 0.5218738317489624} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4006, 'grad_norm': 2.673783540725708, 'learning_rate': 9.333772139018388e-06}[Rank 2] Trainer log: {'loss': 0.4006, 'grad_norm': 2.673783540725708, 'learning_rate': 9.333772139018388e-06}[Rank 1] Trainer log: {'loss': 0.4006, 'grad_norm': 2.673783540725708, 'learning_rate': 9.333772139018388e-06} - - -[Rank 3] Trainer log: {'loss': 0.4006, 'grad_norm': 2.673783540725708, 'learning_rate': 9.333772139018388e-06} -{'loss': 0.4006, 'grad_norm': 2.673783540725708, 'learning_rate': 9.333772139018388e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12567628622055055, 'train/info_loss': 0.13776634633541107, 'train/ref_loss': None, 'train/uncertainty_loss': -9.832968935370446e-05, 'train/video_loss': 0.13766801357269287, 'train/total_loss': 0.26334428787231445} -tensor(0.1783, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2146, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018373928032815458, 'train/lm_loss': 4.155204223934561e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.357957661151886, 'train/uncertainty_loss': 0.02146444767713547, 'train/video_loss': 0.3809157907962799, 'train/total_loss': 0.38095733523368835} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.8074, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3803, 'grad_norm': 4.0947160720825195, 'learning_rate': 9.32312867369146e-06}[Rank 0] Trainer log: {'loss': 0.3803, 'grad_norm': 4.0947160720825195, 'learning_rate': 9.32312867369146e-06} - -[Rank 3] Trainer log: {'loss': 0.3803, 'grad_norm': 4.0947160720825195, 'learning_rate': 9.32312867369146e-06} -[Rank 2] Trainer log: {'loss': 0.3803, 'grad_norm': 4.0947160720825195, 'learning_rate': 9.32312867369146e-06} -{'loss': 0.3803, 'grad_norm': 4.0947160720825195, 'learning_rate': 9.32312867369146e-06, 'epoch': 0.55} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027843769639730455, 'train/lm_loss': 6.097817677073181e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.11140483617782593, 'train/uncertainty_loss': -7.137178909033537e-05, 'train/video_loss': 0.11358832567930222, 'train/total_loss': 0.11364930123090744} -tensor(0.0947, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.255977988243103, 'train/info_loss': 0.17315317690372467, 'train/ref_loss': None, 'train/uncertainty_loss': -9.026019251905382e-05, 'train/video_loss': 0.17306292057037354, 'train/total_loss': 0.42904090881347656} -tensor(0.2015, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.359, 'grad_norm': 7.771337509155273, 'learning_rate': 9.312485978620762e-06}[Rank 1] Trainer log: {'loss': 0.359, 'grad_norm': 7.771337509155273, 'learning_rate': 9.312485978620762e-06}[Rank 0] Trainer log: {'loss': 0.359, 'grad_norm': 7.771337509155273, 'learning_rate': 9.312485978620762e-06} - -[Rank 2] Trainer log: {'loss': 0.359, 'grad_norm': 7.771337509155273, 'learning_rate': 9.312485978620762e-06} - -{'loss': 0.359, 'grad_norm': 7.771337509155273, 'learning_rate': 9.312485978620762e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2902655124664307, 'train/info_loss': 0.16029460728168488, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013230196200311183, 'train/video_loss': 0.1601622998714447, 'train/total_loss': 0.4504278302192688} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13268585205078126, 'train/info_loss': 0.2527315020561218, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011558406986296178, 'train/video_loss': 0.25261592864990234, 'train/total_loss': 0.38530176877975464} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3284, 'grad_norm': 5.478717803955078, 'learning_rate': 9.301844065917327e-06}[Rank 2] Trainer log: {'loss': 0.3284, 'grad_norm': 5.478717803955078, 'learning_rate': 9.301844065917327e-06}[Rank 0] Trainer log: {'loss': 0.3284, 'grad_norm': 5.478717803955078, 'learning_rate': 9.301844065917327e-06} - - -[Rank 1] Trainer log: {'loss': 0.3284, 'grad_norm': 5.478717803955078, 'learning_rate': 9.301844065917327e-06} -{'loss': 0.3284, 'grad_norm': 5.478717803955078, 'learning_rate': 9.301844065917327e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.1765, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.3313, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023891807068139316, 'train/lm_loss': 2.4793995544314386e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.41939979791641235, 'train/uncertainty_loss': 0.03313388228416443, 'train/video_loss': 0.45446476340293884, 'train/total_loss': 0.4544895589351654} -tensor(1.1486, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002179989591240883, 'train/lm_loss': 4.7034455928951505e-05, 'train/info_loss': 2.592734745121561e-05, 'train/ref_loss': 0.10439988225698471, 'train/uncertainty_loss': -6.832254002802074e-05, 'train/video_loss': 0.10610148310661316, 'train/total_loss': 0.10614851862192154} -[Rank 2] Trainer log: {'loss': 0.4825, 'grad_norm': 18.51424217224121, 'learning_rate': 9.291202947691272e-06}[Rank 0] Trainer log: {'loss': 0.4825, 'grad_norm': 18.51424217224121, 'learning_rate': 9.291202947691272e-06}[Rank 3] Trainer log: {'loss': 0.4825, 'grad_norm': 18.51424217224121, 'learning_rate': 9.291202947691272e-06} - - -[Rank 1] Trainer log: {'loss': 0.4825, 'grad_norm': 18.51424217224121, 'learning_rate': 9.291202947691272e-06} -{'loss': 0.4825, 'grad_norm': 18.51424217224121, 'learning_rate': 9.291202947691272e-06, 'epoch': 0.55} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1468, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021507218480110168, 'train/lm_loss': 5.3565483540296555e-05, 'train/info_loss': 2.592734745121561e-05, 'train/ref_loss': 0.3134171962738037, 'train/uncertainty_loss': 0.014675822854042054, 'train/video_loss': 0.3298395276069641, 'train/total_loss': 0.3298930823802948} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.3187, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020506540313363077, 'train/lm_loss': 6.860509747639299e-05, 'train/info_loss': 3.176826794515364e-05, 'train/ref_loss': 0.43290427327156067, 'train/uncertainty_loss': 0.03187484443187714, 'train/video_loss': 0.46645140647888184, 'train/total_loss': 0.4665200114250183} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3214, 'grad_norm': 9.358657836914062, 'learning_rate': 9.28056263605183e-06}[Rank 0] Trainer log: {'loss': 0.3214, 'grad_norm': 9.358657836914062, 'learning_rate': 9.28056263605183e-06} -[Rank 2] Trainer log: {'loss': 0.3214, 'grad_norm': 9.358657836914062, 'learning_rate': 9.28056263605183e-06} - -[Rank 3] Trainer log: {'loss': 0.3214, 'grad_norm': 9.358657836914062, 'learning_rate': 9.28056263605183e-06} -{'loss': 0.3214, 'grad_norm': 9.358657836914062, 'learning_rate': 9.28056263605183e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1147, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3404, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026971925981342796, 'train/lm_loss': 0.00012136590667068959, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.4524582624435425, 'train/uncertainty_loss': 0.03404040336608887, 'train/video_loss': 0.48869049549102783, 'train/total_loss': 0.4888118505477905} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1226392149925232, 'train/info_loss': 0.16481050848960876, 'train/ref_loss': None, 'train/uncertainty_loss': -8.838853100314737e-05, 'train/video_loss': 0.16472211480140686, 'train/total_loss': 0.2873613238334656} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2959, 'grad_norm': 7.889101982116699, 'learning_rate': 9.269923143107308e-06}[Rank 3] Trainer log: {'loss': 0.2959, 'grad_norm': 7.889101982116699, 'learning_rate': 9.269923143107308e-06} - -[Rank 2] Trainer log: {'loss': 0.2959, 'grad_norm': 7.889101982116699, 'learning_rate': 9.269923143107308e-06} -[Rank 0] Trainer log: {'loss': 0.2959, 'grad_norm': 7.889101982116699, 'learning_rate': 9.269923143107308e-06} -{'loss': 0.2959, 'grad_norm': 7.889101982116699, 'learning_rate': 9.269923143107308e-06, 'epoch': 0.55} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0861, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00034105044323951006, 'train/lm_loss': 5.320794880390168e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.26000481843948364, 'train/uncertainty_loss': 0.008614403009414674, 'train/video_loss': 0.2713741362094879, 'train/total_loss': 0.27142733335494995} -tensor(0.0633, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1072, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002771625062450767, 'train/lm_loss': 0.0001564605860039592, 'train/info_loss': 3.409269265830517e-05, 'train/ref_loss': 0.28541702032089233, 'train/uncertainty_loss': 0.010715561360120774, 'train/video_loss': 0.2983839809894562, 'train/total_loss': 0.2985404431819916} -tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2516, 'grad_norm': 2.942739486694336, 'learning_rate': 9.25928448096508e-06}[Rank 0] Trainer log: {'loss': 0.2516, 'grad_norm': 2.942739486694336, 'learning_rate': 9.25928448096508e-06}[Rank 3] Trainer log: {'loss': 0.2516, 'grad_norm': 2.942739486694336, 'learning_rate': 9.25928448096508e-06} - - -[Rank 2] Trainer log: {'loss': 0.2516, 'grad_norm': 2.942739486694336, 'learning_rate': 9.25928448096508e-06} -{'loss': 0.2516, 'grad_norm': 2.942739486694336, 'learning_rate': 9.25928448096508e-06, 'epoch': 0.55} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.6408, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) {'train/tv_loss': 0.00035058287903666496, 'train/lm_loss': 4.6438546269200744e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.689724326133728, 'train/uncertainty_loss': 0.06408032178878785, 'train/video_loss': 0.7566345930099487, 'train/total_loss': 0.7566810250282288} -tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.0001823872677050531, 'train/lm_loss': 6.073982804082334e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.10497887432575226, 'train/uncertainty_loss': -6.938718142919242e-05, 'train/video_loss': 0.10639594495296478, 'train/total_loss': 0.1064566820859909} -[Rank 2] Trainer log: {'loss': 0.2393, 'grad_norm': 12.407557487487793, 'learning_rate': 9.24864666173158e-06}[Rank 0] Trainer log: {'loss': 0.2393, 'grad_norm': 12.407557487487793, 'learning_rate': 9.24864666173158e-06} -[Rank 1] Trainer log: {'loss': 0.2393, 'grad_norm': 12.407557487487793, 'learning_rate': 9.24864666173158e-06} - -[Rank 3] Trainer log: {'loss': 0.2393, 'grad_norm': 12.407557487487793, 'learning_rate': 9.24864666173158e-06} -{'loss': 0.2393, 'grad_norm': 12.407557487487793, 'learning_rate': 9.24864666173158e-06, 'epoch': 0.55} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0163, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021611009724438192, 'train/lm_loss': 5.33986312802881e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.15203598141670227, 'train/uncertainty_loss': -6.957332952879369e-05, 'train/video_loss': 0.15372222661972046, 'train/total_loss': 0.1537756323814392} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21316821575164796, 'train/info_loss': 0.12229054421186447, 'train/ref_loss': None, 'train/uncertainty_loss': -8.493220666423441e-05, 'train/video_loss': 0.12220561504364014, 'train/total_loss': 0.33537381887435913} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3565, 'grad_norm': 2.7887606620788574, 'learning_rate': 9.238009697512274e-06} -[Rank 0] Trainer log: {'loss': 0.3565, 'grad_norm': 2.7887606620788574, 'learning_rate': 9.238009697512274e-06}[Rank 3] Trainer log: {'loss': 0.3565, 'grad_norm': 2.7887606620788574, 'learning_rate': 9.238009697512274e-06} - -[Rank 2] Trainer log: {'loss': 0.3565, 'grad_norm': 2.7887606620788574, 'learning_rate': 9.238009697512274e-06} -{'loss': 0.3565, 'grad_norm': 2.7887606620788574, 'learning_rate': 9.238009697512274e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0283, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.6054, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020277411676943304, 'train/lm_loss': 3.182634827680886e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.6312800645828247, 'train/uncertainty_loss': 0.06054283976554871, 'train/video_loss': 0.6934685111045837, 'train/total_loss': 0.6935003399848938} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0891, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0069, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1928, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020064390264451504, 'train/lm_loss': 3.6427041050046685e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.3427790105342865, 'train/uncertainty_loss': 0.019283726811408997, 'train/video_loss': 0.36369097232818604, 'train/total_loss': 0.3637273907661438} -[Rank 1] Trainer log: {'loss': 0.4242, 'grad_norm': 3.7637710571289062, 'learning_rate': 9.227373600411668e-06} -[Rank 3] Trainer log: {'loss': 0.4242, 'grad_norm': 3.7637710571289062, 'learning_rate': 9.227373600411668e-06}[Rank 0] Trainer log: {'loss': 0.4242, 'grad_norm': 3.7637710571289062, 'learning_rate': 9.227373600411668e-06} - -[Rank 2] Trainer log: {'loss': 0.4242, 'grad_norm': 3.7637710571289062, 'learning_rate': 9.227373600411668e-06} -{'loss': 0.4242, 'grad_norm': 3.7637710571289062, 'learning_rate': 9.227373600411668e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26769261360168456, 'train/info_loss': 0.2196851372718811, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011174279497936369, 'train/video_loss': 0.21957339346408844, 'train/total_loss': 0.48726600408554077} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024801099207252266, 'train/lm_loss': 6.917710416018963e-05, 'train/info_loss': 2.8907417799928226e-05, 'train/ref_loss': 0.08887650072574615, 'train/uncertainty_loss': -6.669271388091147e-05, 'train/video_loss': 0.09082280844449997, 'train/total_loss': 0.09089198708534241} -[Rank 1] Trainer log: {'loss': 0.366, 'grad_norm': 3.211427688598633, 'learning_rate': 9.216738382533267e-06}[Rank 3] Trainer log: {'loss': 0.366, 'grad_norm': 3.211427688598633, 'learning_rate': 9.216738382533267e-06}[Rank 0] Trainer log: {'loss': 0.366, 'grad_norm': 3.211427688598633, 'learning_rate': 9.216738382533267e-06} - - -[Rank 2] Trainer log: {'loss': 0.366, 'grad_norm': 3.211427688598633, 'learning_rate': 9.216738382533267e-06} -{'loss': 0.366, 'grad_norm': 3.211427688598633, 'learning_rate': 9.216738382533267e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11113511323928833, 'train/info_loss': 0.19855089485645294, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011325160739943385, 'train/video_loss': 0.1984376460313797, 'train/total_loss': 0.3095727562904358} -tensor(0.0511, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2427, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005835212767124177, 'train/lm_loss': 4.1528203291818505e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.37916725873947144, 'train/uncertainty_loss': 0.024273116886615754, 'train/video_loss': 0.4081323444843292, 'train/total_loss': 0.4081738591194153} -[Rank 0] Trainer log: {'loss': 0.3317, 'grad_norm': 7.694416522979736, 'learning_rate': 9.206104055979585e-06}[Rank 3] Trainer log: {'loss': 0.3317, 'grad_norm': 7.694416522979736, 'learning_rate': 9.206104055979585e-06}[Rank 2] Trainer log: {'loss': 0.3317, 'grad_norm': 7.694416522979736, 'learning_rate': 9.206104055979585e-06} - - -[Rank 1] Trainer log: {'loss': 0.3317, 'grad_norm': 7.694416522979736, 'learning_rate': 9.206104055979585e-06} -{'loss': 0.3317, 'grad_norm': 7.694416522979736, 'learning_rate': 9.206104055979585e-06, 'epoch': 0.55} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.6048, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0039, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002489942824468017, 'train/lm_loss': 6.104967906139791e-05, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.18501849472522736, 'train/uncertainty_loss': 0.00038980660028755667, 'train/video_loss': 0.18743154406547546, 'train/total_loss': 0.18749259412288666} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0668, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0697, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001884793513454497, 'train/lm_loss': 3.6331691080704335e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.24946756660938263, 'train/uncertainty_loss': 0.006966853886842728, 'train/video_loss': 0.25796639919281006, 'train/total_loss': 0.25800272822380066} -tensor(0.0775, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3594, 'grad_norm': 2.3701202869415283, 'learning_rate': 9.195470632852125e-06}[Rank 0] Trainer log: {'loss': 0.3594, 'grad_norm': 2.3701202869415283, 'learning_rate': 9.195470632852125e-06} -[Rank 3] Trainer log: {'loss': 0.3594, 'grad_norm': 2.3701202869415283, 'learning_rate': 9.195470632852125e-06} - -[Rank 1] Trainer log: {'loss': 0.3594, 'grad_norm': 2.3701202869415283, 'learning_rate': 9.195470632852125e-06} -{'loss': 0.3594, 'grad_norm': 2.3701202869415283, 'learning_rate': 9.195470632852125e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1226, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003677859902381897, 'train/lm_loss': 4.71059640403837e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.28232908248901367, 'train/uncertainty_loss': 0.012259149551391603, 'train/video_loss': 0.2975550889968872, 'train/total_loss': 0.29760220646858215} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.46223239898681645, 'train/info_loss': 0.20951387286186218, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012176741147413851, 'train/video_loss': 0.20939210057258606, 'train/total_loss': 0.6716245412826538} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2092, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3853, 'grad_norm': 7.6610026359558105, 'learning_rate': 9.184838125251342e-06}[Rank 1] Trainer log: {'loss': 0.3853, 'grad_norm': 7.6610026359558105, 'learning_rate': 9.184838125251342e-06}[Rank 2] Trainer log: {'loss': 0.3853, 'grad_norm': 7.6610026359558105, 'learning_rate': 9.184838125251342e-06} - - -[Rank 3] Trainer log: {'loss': 0.3853, 'grad_norm': 7.6610026359558105, 'learning_rate': 9.184838125251342e-06} -{'loss': 0.3853, 'grad_norm': 7.6610026359558105, 'learning_rate': 9.184838125251342e-06, 'epoch': 0.55} -tensor(0.1048, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1938, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5050107955932618, 'train/info_loss': 0.14222250878810883, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011442047543823719, 'train/video_loss': 0.14210808277130127, 'train/total_loss': 0.647118866443634} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1445, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5092, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003156833117827773, 'train/lm_loss': 3.204089007340372e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.6021792888641357, 'train/uncertainty_loss': 0.05092241764068604, 'train/video_loss': 0.6556484699249268, 'train/total_loss': 0.6556805372238159} -[Rank 1] Trainer log: {'loss': 0.4696, 'grad_norm': 2.990781784057617, 'learning_rate': 9.174206545276678e-06}[Rank 3] Trainer log: {'loss': 0.4696, 'grad_norm': 2.990781784057617, 'learning_rate': 9.174206545276678e-06} -[Rank 2] Trainer log: {'loss': 0.4696, 'grad_norm': 2.990781784057617, 'learning_rate': 9.174206545276678e-06} - -[Rank 0] Trainer log: {'loss': 0.4696, 'grad_norm': 2.990781784057617, 'learning_rate': 9.174206545276678e-06} -{'loss': 0.4696, 'grad_norm': 2.990781784057617, 'learning_rate': 9.174206545276678e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.6692, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002707474632188678, 'train/lm_loss': 6.915327394381165e-05, 'train/info_loss': 2.8907417799928226e-05, 'train/ref_loss': 0.13663944602012634, 'train/uncertainty_loss': -6.807194440625609e-05, 'train/video_loss': 0.13876625895500183, 'train/total_loss': 0.13883541524410248} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029139099642634393, 'train/lm_loss': 7.689904887229205e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.16890206933021545, 'train/uncertainty_loss': -6.944728665985167e-05, 'train/video_loss': 0.17119157314300537, 'train/total_loss': 0.17126847803592682} -tensor(0.1397, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2884, 'grad_norm': 15.408380508422852, 'learning_rate': 9.163575905026494e-06}[Rank 2] Trainer log: {'loss': 0.2884, 'grad_norm': 15.408380508422852, 'learning_rate': 9.163575905026494e-06} -[Rank 3] Trainer log: {'loss': 0.2884, 'grad_norm': 15.408380508422852, 'learning_rate': 9.163575905026494e-06} - -[Rank 0] Trainer log: {'loss': 0.2884, 'grad_norm': 15.408380508422852, 'learning_rate': 9.163575905026494e-06} -{'loss': 0.2884, 'grad_norm': 15.408380508422852, 'learning_rate': 9.163575905026494e-06, 'epoch': 0.55} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2745846271514893, 'train/info_loss': 0.16951704025268555, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011891252361238003, 'train/video_loss': 0.16939812898635864, 'train/total_loss': 0.4439827501773834} -tensor(0.2089, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0785706639289856, 'train/info_loss': 0.13730105757713318, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010901604546234012, 'train/video_loss': 0.137192040681839, 'train/total_loss': 0.21576270461082458} -tensor(0.1046, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.433, 'grad_norm': 3.2139129638671875, 'learning_rate': 9.1529462165981e-06}[Rank 1] Trainer log: {'loss': 0.433, 'grad_norm': 3.2139129638671875, 'learning_rate': 9.1529462165981e-06} - -[Rank 3] Trainer log: {'loss': 0.433, 'grad_norm': 3.2139129638671875, 'learning_rate': 9.1529462165981e-06} -[Rank 0] Trainer log: {'loss': 0.433, 'grad_norm': 3.2139129638671875, 'learning_rate': 9.1529462165981e-06} -{'loss': 0.433, 'grad_norm': 3.2139129638671875, 'learning_rate': 9.1529462165981e-06, 'epoch': 0.55} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40537533760070804, 'train/info_loss': 0.14399798214435577, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010811272077262402, 'train/video_loss': 0.1438898742198944, 'train/total_loss': 0.5492652058601379} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2452869415283203, 'train/info_loss': 0.170973539352417, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001263252692297101, 'train/video_loss': 0.17084720730781555, 'train/total_loss': 0.41613414883613586} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0839, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4169, 'grad_norm': 5.179174423217773, 'learning_rate': 9.142317492087708e-06} -[Rank 3] Trainer log: {'loss': 0.4169, 'grad_norm': 5.179174423217773, 'learning_rate': 9.142317492087708e-06}[Rank 1] Trainer log: {'loss': 0.4169, 'grad_norm': 5.179174423217773, 'learning_rate': 9.142317492087708e-06} - -[Rank 0] Trainer log: {'loss': 0.4169, 'grad_norm': 5.179174423217773, 'learning_rate': 9.142317492087708e-06} -{'loss': 0.4169, 'grad_norm': 5.179174423217773, 'learning_rate': 9.142317492087708e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0032, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023079614620655776, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.18635442852973938, 'train/uncertainty_loss': 0.0003220364917069674, 'train/video_loss': 0.18854418396949768, 'train/total_loss': 0.18857228755950928} -tensor(0.0618, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08253957629203797, 'train/info_loss': 0.1869511902332306, 'train/ref_loss': None, 'train/uncertainty_loss': -9.878388373181225e-05, 'train/video_loss': 0.18685241043567657, 'train/total_loss': 0.2693919837474823} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0961, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2859, 'grad_norm': 10.601298332214355, 'learning_rate': 9.131689743590442e-06}[Rank 1] Trainer log: {'loss': 0.2859, 'grad_norm': 10.601298332214355, 'learning_rate': 9.131689743590442e-06}[Rank 2] Trainer log: {'loss': 0.2859, 'grad_norm': 10.601298332214355, 'learning_rate': 9.131689743590442e-06} - - -[Rank 3] Trainer log: {'loss': 0.2859, 'grad_norm': 10.601298332214355, 'learning_rate': 9.131689743590442e-06} -{'loss': 0.2859, 'grad_norm': 10.601298332214355, 'learning_rate': 9.131689743590442e-06, 'epoch': 0.55} -tensor(0.1149, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002421250566840172, 'train/lm_loss': 5.344630335457623e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.28464946150779724, 'train/uncertainty_loss': 0.011491268128156663, 'train/video_loss': 0.29810425639152527, 'train/total_loss': 0.2981576919555664} -tensor(0.5961, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1309, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2484498977661133, 'train/info_loss': 0.20741936564445496, 'train/ref_loss': None, 'train/uncertainty_loss': -8.994783856905997e-05, 'train/video_loss': 0.2073294222354889, 'train/total_loss': 0.4557793140411377} -tensor(0.0141, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1038, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3699, 'grad_norm': 3.5670077800750732, 'learning_rate': 9.12106298320032e-06}[Rank 0] Trainer log: {'loss': 0.3699, 'grad_norm': 3.5670077800750732, 'learning_rate': 9.12106298320032e-06}[Rank 2] Trainer log: {'loss': 0.3699, 'grad_norm': 3.5670077800750732, 'learning_rate': 9.12106298320032e-06} - -[Rank 3] Trainer log: {'loss': 0.3699, 'grad_norm': 3.5670077800750732, 'learning_rate': 9.12106298320032e-06} - -{'loss': 0.3699, 'grad_norm': 3.5670077800750732, 'learning_rate': 9.12106298320032e-06, 'epoch': 0.55} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04165436923503876, 'train/info_loss': 0.1545407623052597, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010383529588580132, 'train/video_loss': 0.15443693101406097, 'train/total_loss': 0.19609129428863525} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1663, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1777, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027846714947372676, 'train/lm_loss': 6.886727060191333e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.16273248195648193, 'train/uncertainty_loss': -6.505821365863085e-05, 'train/video_loss': 0.16492591798305511, 'train/total_loss': 0.16499479115009308} -tensor(0.2865, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2938, 'grad_norm': 13.46999454498291, 'learning_rate': 9.110437223010217e-06}[Rank 2] Trainer log: {'loss': 0.2938, 'grad_norm': 13.46999454498291, 'learning_rate': 9.110437223010217e-06}[Rank 1] Trainer log: {'loss': 0.2938, 'grad_norm': 13.46999454498291, 'learning_rate': 9.110437223010217e-06} - - -[Rank 0] Trainer log: {'loss': 0.2938, 'grad_norm': 13.46999454498291, 'learning_rate': 9.110437223010217e-06} -{'loss': 0.2938, 'grad_norm': 13.46999454498291, 'learning_rate': 9.110437223010217e-06, 'epoch': 0.55} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30661132335662844, 'train/info_loss': 0.22122928500175476, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011310392292216421, 'train/video_loss': 0.22111618518829346, 'train/total_loss': 0.527727484703064} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1367, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2835, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002960506360977888, 'train/lm_loss': 5.378000205382705e-05, 'train/info_loss': 2.962263170047663e-05, 'train/ref_loss': 0.40320125222206116, 'train/uncertainty_loss': 0.028347185254096987, 'train/video_loss': 0.4339464604854584, 'train/total_loss': 0.43400025367736816} -[Rank 2] Trainer log: {'loss': 0.3454, 'grad_norm': 14.396492004394531, 'learning_rate': 9.099812475111895e-06}[Rank 1] Trainer log: {'loss': 0.3454, 'grad_norm': 14.396492004394531, 'learning_rate': 9.099812475111895e-06} -[Rank 3] Trainer log: {'loss': 0.3454, 'grad_norm': 14.396492004394531, 'learning_rate': 9.099812475111895e-06} - -[Rank 0] Trainer log: {'loss': 0.3454, 'grad_norm': 14.396492004394531, 'learning_rate': 9.099812475111895e-06} -{'loss': 0.3454, 'grad_norm': 14.396492004394531, 'learning_rate': 9.099812475111895e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020645663607865572, 'train/lm_loss': 3.6069477209821345e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.1556714028120041, 'train/uncertainty_loss': -6.9972756318748e-05, 'train/video_loss': 0.15727686882019043, 'train/total_loss': 0.15731294453144073} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1412, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2174118995666504, 'train/info_loss': 0.14136287569999695, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013287768233567477, 'train/video_loss': 0.1412300020456314, 'train/total_loss': 0.35864192247390747} -tensor(0.1236, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3749, 'grad_norm': 2.509734630584717, 'learning_rate': 9.089188751595937e-06}[Rank 0] Trainer log: {'loss': 0.3749, 'grad_norm': 2.509734630584717, 'learning_rate': 9.089188751595937e-06}[Rank 3] Trainer log: {'loss': 0.3749, 'grad_norm': 2.509734630584717, 'learning_rate': 9.089188751595937e-06} - - -[Rank 2] Trainer log: {'loss': 0.3749, 'grad_norm': 2.509734630584717, 'learning_rate': 9.089188751595937e-06} -{'loss': 0.3749, 'grad_norm': 2.509734630584717, 'learning_rate': 9.089188751595937e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002425121143460274, 'train/lm_loss': 6.06921617873013e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.17894014716148376, 'train/uncertainty_loss': -7.15229834895581e-05, 'train/video_loss': 0.18083810806274414, 'train/total_loss': 0.18089880049228668} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.403077507019043, 'train/info_loss': 0.27405843138694763, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001480280188843608, 'train/video_loss': 0.27391040325164795, 'train/total_loss': 0.676987886428833} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4338, 'grad_norm': 3.060697317123413, 'learning_rate': 9.078566064551788e-06}[Rank 1] Trainer log: {'loss': 0.4338, 'grad_norm': 3.060697317123413, 'learning_rate': 9.078566064551788e-06} -[Rank 3] Trainer log: {'loss': 0.4338, 'grad_norm': 3.060697317123413, 'learning_rate': 9.078566064551788e-06} - -[Rank 2] Trainer log: {'loss': 0.4338, 'grad_norm': 3.060697317123413, 'learning_rate': 9.078566064551788e-06} -{'loss': 0.4338, 'grad_norm': 3.060697317123413, 'learning_rate': 9.078566064551788e-06, 'epoch': 0.55} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26277966499328614, 'train/info_loss': 0.18983784317970276, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010621114633977414, 'train/video_loss': 0.189731627702713, 'train/total_loss': 0.4525113105773926} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1510, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0262, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002584239933639765, 'train/lm_loss': 3.168332041241229e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.2125122845172882, 'train/uncertainty_loss': 0.002616296336054802, 'train/video_loss': 0.2172183245420456, 'train/total_loss': 0.2172500044107437} -tensor(0.0108, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3038, 'grad_norm': 5.018774509429932, 'learning_rate': 9.067944426067688e-06}[Rank 1] Trainer log: {'loss': 0.3038, 'grad_norm': 5.018774509429932, 'learning_rate': 9.067944426067688e-06} - -[Rank 0] Trainer log: {'loss': 0.3038, 'grad_norm': 5.018774509429932, 'learning_rate': 9.067944426067688e-06}[Rank 2] Trainer log: {'loss': 0.3038, 'grad_norm': 5.018774509429932, 'learning_rate': 9.067944426067688e-06} - -{'loss': 0.3038, 'grad_norm': 5.018774509429932, 'learning_rate': 9.067944426067688e-06, 'epoch': 0.55} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3704, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019042911008000375, 'train/lm_loss': 4.651005729101599e-05, 'train/info_loss': 2.592734745121561e-05, 'train/ref_loss': 0.09771868586540222, 'train/uncertainty_loss': -6.888741627335549e-05, 'train/video_loss': 0.09919916093349457, 'train/total_loss': 0.09924566745758057} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1961296319961548, 'train/info_loss': 0.17122480273246765, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011867266148328781, 'train/video_loss': 0.17110612988471985, 'train/total_loss': 0.36723577976226807} -tensor(0.1799, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3493, 'grad_norm': 8.686603546142578, 'learning_rate': 9.057323848230701e-06}[Rank 3] Trainer log: {'loss': 0.3493, 'grad_norm': 8.686603546142578, 'learning_rate': 9.057323848230701e-06}[Rank 0] Trainer log: {'loss': 0.3493, 'grad_norm': 8.686603546142578, 'learning_rate': 9.057323848230701e-06} - - -[Rank 2] Trainer log: {'loss': 0.3493, 'grad_norm': 8.686603546142578, 'learning_rate': 9.057323848230701e-06} -{'loss': 0.3493, 'grad_norm': 8.686603546142578, 'learning_rate': 9.057323848230701e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1319, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0744, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004630133975297213, 'train/lm_loss': 4.112297610845417e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.11285079270601273, 'train/uncertainty_loss': -8.332119323313237e-05, 'train/video_loss': 0.1164964959025383, 'train/total_loss': 0.11653761565685272} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2628, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020116420928388836, 'train/lm_loss': 4.090844304300845e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.1483575701713562, 'train/uncertainty_loss': -6.88530970364809e-05, 'train/video_loss': 0.14992377161979675, 'train/total_loss': 0.14996467530727386} -[Rank 3] Trainer log: {'loss': 0.2924, 'grad_norm': 3.579298257827759, 'learning_rate': 9.04670434312668e-06} -[Rank 0] Trainer log: {'loss': 0.2924, 'grad_norm': 3.579298257827759, 'learning_rate': 9.04670434312668e-06}[Rank 1] Trainer log: {'loss': 0.2924, 'grad_norm': 3.579298257827759, 'learning_rate': 9.04670434312668e-06} -[Rank 2] Trainer log: {'loss': 0.2924, 'grad_norm': 3.579298257827759, 'learning_rate': 9.04670434312668e-06} - -{'loss': 0.2924, 'grad_norm': 3.579298257827759, 'learning_rate': 9.04670434312668e-06, 'epoch': 0.55} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3075, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029390871059149505, 'train/lm_loss': 2.4698639754205944e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.4104575514793396, 'train/uncertainty_loss': 0.030749958753585816, 'train/video_loss': 0.4435788094997406, 'train/total_loss': 0.443603515625} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03956726491451264, 'train/info_loss': 0.07958732545375824, 'train/ref_loss': None, 'train/uncertainty_loss': -9.146417141892016e-05, 'train/video_loss': 0.0794958621263504, 'train/total_loss': 0.1190631240606308} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2426, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2896, 'grad_norm': 12.210664749145508, 'learning_rate': 9.036085922840252e-06}[Rank 2] Trainer log: {'loss': 0.2896, 'grad_norm': 12.210664749145508, 'learning_rate': 9.036085922840252e-06} - -[Rank 3] Trainer log: {'loss': 0.2896, 'grad_norm': 12.210664749145508, 'learning_rate': 9.036085922840252e-06} -[Rank 0] Trainer log: {'loss': 0.2896, 'grad_norm': 12.210664749145508, 'learning_rate': 9.036085922840252e-06} -{'loss': 0.2896, 'grad_norm': 12.210664749145508, 'learning_rate': 9.036085922840252e-06, 'epoch': 0.55} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1780571699142456, 'train/info_loss': 0.28473106026649475, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015100892633199692, 'train/video_loss': 0.2845800518989563, 'train/total_loss': 0.4626372456550598} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.2227, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2832, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018995951395481825, 'train/lm_loss': 3.256532654631883e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.3999098539352417, 'train/uncertainty_loss': 0.028324890136718753, 'train/video_loss': 0.4297778606414795, 'train/total_loss': 0.42981043457984924} -tensor(0.3264, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3947, 'grad_norm': 13.47357177734375, 'learning_rate': 9.025468599454818e-06} -[Rank 2] Trainer log: {'loss': 0.3947, 'grad_norm': 13.47357177734375, 'learning_rate': 9.025468599454818e-06}[Rank 0] Trainer log: {'loss': 0.3947, 'grad_norm': 13.47357177734375, 'learning_rate': 9.025468599454818e-06} -[Rank 3] Trainer log: {'loss': 0.3947, 'grad_norm': 13.47357177734375, 'learning_rate': 9.025468599454818e-06} - -{'loss': 0.3947, 'grad_norm': 13.47357177734375, 'learning_rate': 9.025468599454818e-06, 'epoch': 0.55} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2176, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003544754581525922, 'train/lm_loss': 0.00010163584956899286, 'train/info_loss': 3.683431350509636e-05, 'train/ref_loss': 0.35894498229026794, 'train/uncertainty_loss': 0.021761536598205566, 'train/video_loss': 0.38357916474342346, 'train/total_loss': 0.3836807906627655} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25544722080230714, 'train/info_loss': 0.21635828912258148, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013895109295845032, 'train/video_loss': 0.21621933579444885, 'train/total_loss': 0.4716665744781494} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1374, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2943, 'grad_norm': 2.9427649974823, 'learning_rate': 9.014852385052519e-06}[Rank 1] Trainer log: {'loss': 0.2943, 'grad_norm': 2.9427649974823, 'learning_rate': 9.014852385052519e-06}[Rank 3] Trainer log: {'loss': 0.2943, 'grad_norm': 2.9427649974823, 'learning_rate': 9.014852385052519e-06} - - -[Rank 2] Trainer log: {'loss': 0.2943, 'grad_norm': 2.9427649974823, 'learning_rate': 9.014852385052519e-06} -{'loss': 0.2943, 'grad_norm': 2.9427649974823, 'learning_rate': 9.014852385052519e-06, 'epoch': 0.55} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2022, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3223651170730591, 'train/info_loss': 0.3853102922439575, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011109838960692287, 'train/video_loss': 0.3851991891860962, 'train/total_loss': 0.7075643539428711} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1230, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(1.8659, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3389, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003209629096090794, 'train/lm_loss': 2.8059899341315032e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.4315263032913208, 'train/uncertainty_loss': 0.03389402031898499, 'train/video_loss': 0.4680107533931732, 'train/total_loss': 0.4680388271808624} -[Rank 2] Trainer log: {'loss': 0.6372, 'grad_norm': 21.63381004333496, 'learning_rate': 9.004237291714248e-06} -[Rank 1] Trainer log: {'loss': 0.6372, 'grad_norm': 21.63381004333496, 'learning_rate': 9.004237291714248e-06}[Rank 0] Trainer log: {'loss': 0.6372, 'grad_norm': 21.63381004333496, 'learning_rate': 9.004237291714248e-06} - -[Rank 3] Trainer log: {'loss': 0.6372, 'grad_norm': 21.63381004333496, 'learning_rate': 9.004237291714248e-06} -{'loss': 0.6372, 'grad_norm': 21.63381004333496, 'learning_rate': 9.004237291714248e-06, 'epoch': 0.56} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000219904026016593, 'train/lm_loss': 6.1025843024253845e-05, 'train/info_loss': 3.254307739553042e-05, 'train/ref_loss': 0.19062358140945435, 'train/uncertainty_loss': -6.967116496525705e-05, 'train/video_loss': 0.19234567880630493, 'train/total_loss': 0.19240669906139374} -tensor(0.0553, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3050, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029441134538501503, 'train/lm_loss': 9.238969651050866e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.03584258258342743, 'train/uncertainty_loss': -6.99342112056911e-05, 'train/video_loss': 0.038155775517225266, 'train/total_loss': 0.03824816644191742} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3035, 'grad_norm': 1.5874674320220947, 'learning_rate': 8.99362333151961e-06} -[Rank 1] Trainer log: {'loss': 0.3035, 'grad_norm': 1.5874674320220947, 'learning_rate': 8.99362333151961e-06} -[Rank 0] Trainer log: {'loss': 0.3035, 'grad_norm': 1.5874674320220947, 'learning_rate': 8.99362333151961e-06}[Rank 3] Trainer log: {'loss': 0.3035, 'grad_norm': 1.5874674320220947, 'learning_rate': 8.99362333151961e-06} - -{'loss': 0.3035, 'grad_norm': 1.5874674320220947, 'learning_rate': 8.99362333151961e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.49175891876220706, 'train/info_loss': 0.16716443002223969, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011430287268012763, 'train/video_loss': 0.16705012321472168, 'train/total_loss': 0.6588090658187866} -tensor(0.2413, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1126, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14258801937103271, 'train/info_loss': 0.23832416534423828, 'train/ref_loss': None, 'train/uncertainty_loss': -8.84438748471439e-05, 'train/video_loss': 0.2382357269525528, 'train/total_loss': 0.3808237314224243} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3815, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4438, 'grad_norm': 9.210333824157715, 'learning_rate': 8.983010516546925e-06} -[Rank 0] Trainer log: {'loss': 0.4438, 'grad_norm': 9.210333824157715, 'learning_rate': 8.983010516546925e-06}[Rank 3] Trainer log: {'loss': 0.4438, 'grad_norm': 9.210333824157715, 'learning_rate': 8.983010516546925e-06} -[Rank 2] Trainer log: {'loss': 0.4438, 'grad_norm': 9.210333824157715, 'learning_rate': 8.983010516546925e-06} - -{'loss': 0.4438, 'grad_norm': 9.210333824157715, 'learning_rate': 8.983010516546925e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2292158603668213, 'train/info_loss': 0.1503593772649765, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011496699880808592, 'train/video_loss': 0.15024441480636597, 'train/total_loss': 0.37946027517318726} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0001, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0935, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0645, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019625925924628975, 'train/lm_loss': 9.939584415405989e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.24444207549095154, 'train/uncertainty_loss': 0.006449112296104432, 'train/video_loss': 0.25248974561691284, 'train/total_loss': 0.2525891363620758} -[Rank 3] Trainer log: {'loss': 0.3131, 'grad_norm': 6.922909736633301, 'learning_rate': 8.97239885887322e-06}[Rank 0] Trainer log: {'loss': 0.3131, 'grad_norm': 6.922909736633301, 'learning_rate': 8.97239885887322e-06}[Rank 2] Trainer log: {'loss': 0.3131, 'grad_norm': 6.922909736633301, 'learning_rate': 8.97239885887322e-06} - - -[Rank 1] Trainer log: {'loss': 0.3131, 'grad_norm': 6.922909736633301, 'learning_rate': 8.97239885887322e-06} -{'loss': 0.3131, 'grad_norm': 6.922909736633301, 'learning_rate': 8.97239885887322e-06, 'epoch': 0.56} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07027212977409363, 'train/info_loss': 0.17274342477321625, 'train/ref_loss': None, 'train/uncertainty_loss': -9.008837514556944e-05, 'train/video_loss': 0.17265333235263824, 'train/total_loss': 0.2429254651069641} -tensor(0.0629, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0184, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023967642337083817, 'train/lm_loss': 4.14328562328592e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.17328542470932007, 'train/uncertainty_loss': -6.977331941016018e-05, 'train/video_loss': 0.17515720427036285, 'train/total_loss': 0.17519864439964294} -[Rank 1] Trainer log: {'loss': 0.2858, 'grad_norm': 6.061092376708984, 'learning_rate': 8.961788370574184e-06} -[Rank 2] Trainer log: {'loss': 0.2858, 'grad_norm': 6.061092376708984, 'learning_rate': 8.961788370574184e-06} -[Rank 0] Trainer log: {'loss': 0.2858, 'grad_norm': 6.061092376708984, 'learning_rate': 8.961788370574184e-06}[Rank 3] Trainer log: {'loss': 0.2858, 'grad_norm': 6.061092376708984, 'learning_rate': 8.961788370574184e-06} - -{'loss': 0.2858, 'grad_norm': 6.061092376708984, 'learning_rate': 8.961788370574184e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1880, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003741441294550896, 'train/lm_loss': 3.635553002823144e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.33924394845962524, 'train/uncertainty_loss': 0.018800055980682375, 'train/video_loss': 0.36106058955192566, 'train/total_loss': 0.36109694838523865} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23377931118011475, 'train/info_loss': 0.22430726885795593, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001321212388575077, 'train/video_loss': 0.22417514026165009, 'train/total_loss': 0.457954466342926} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1523, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4182, 'grad_norm': 6.8900837898254395, 'learning_rate': 8.951179063724199e-06}[Rank 0] Trainer log: {'loss': 0.4182, 'grad_norm': 6.8900837898254395, 'learning_rate': 8.951179063724199e-06}[Rank 2] Trainer log: {'loss': 0.4182, 'grad_norm': 6.8900837898254395, 'learning_rate': 8.951179063724199e-06} - -[Rank 1] Trainer log: {'loss': 0.4182, 'grad_norm': 6.8900837898254395, 'learning_rate': 8.951179063724199e-06} - -{'loss': 0.4182, 'grad_norm': 6.8900837898254395, 'learning_rate': 8.951179063724199e-06, 'epoch': 0.56} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38649461269378665, 'train/info_loss': 0.22110435366630554, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001274154637940228, 'train/video_loss': 0.22097693383693695, 'train/total_loss': 0.6074715256690979} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0264, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.3113, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00038760872557759286, 'train/lm_loss': 3.621250216383487e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.4119510054588318, 'train/uncertainty_loss': 0.03112652897834778, 'train/video_loss': 0.44620218873023987, 'train/total_loss': 0.4462383985519409} -tensor(0.1808, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3527, 'grad_norm': 20.404314041137695, 'learning_rate': 8.940570950396274e-06}[Rank 0] Trainer log: {'loss': 0.3527, 'grad_norm': 20.404314041137695, 'learning_rate': 8.940570950396274e-06}[Rank 1] Trainer log: {'loss': 0.3527, 'grad_norm': 20.404314041137695, 'learning_rate': 8.940570950396274e-06} - - -[Rank 2] Trainer log: {'loss': 0.3527, 'grad_norm': 20.404314041137695, 'learning_rate': 8.940570950396274e-06} -{'loss': 0.3527, 'grad_norm': 20.404314041137695, 'learning_rate': 8.940570950396274e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.48944683074951173, 'train/info_loss': 0.2251630425453186, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012230491265654563, 'train/video_loss': 0.2250407338142395, 'train/total_loss': 0.7144875526428223} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.273010778427124, 'train/info_loss': 0.15003924071788788, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010431231930851936, 'train/video_loss': 0.14993493258953094, 'train/total_loss': 0.4229457378387451} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2161, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3353, 'grad_norm': 7.610635757446289, 'learning_rate': 8.929964042662092e-06}[Rank 2] Trainer log: {'loss': 0.3353, 'grad_norm': 7.610635757446289, 'learning_rate': 8.929964042662092e-06} -[Rank 1] Trainer log: {'loss': 0.3353, 'grad_norm': 7.610635757446289, 'learning_rate': 8.929964042662092e-06} - -[Rank 3] Trainer log: {'loss': 0.3353, 'grad_norm': 7.610635757446289, 'learning_rate': 8.929964042662092e-06} -{'loss': 0.3353, 'grad_norm': 7.610635757446289, 'learning_rate': 8.929964042662092e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0481, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1718, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003502966603264213, 'train/lm_loss': 4.105146508663893e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.32688766717910767, 'train/uncertainty_loss': 0.017176987230777742, 'train/video_loss': 0.3468923568725586, 'train/total_loss': 0.34693339467048645} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4096261501312256, 'train/info_loss': 0.1936580389738083, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010123669635504485, 'train/video_loss': 0.1935568004846573, 'train/total_loss': 0.6031829714775085} -tensor(0.0708, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3715, 'grad_norm': 2.972278594970703, 'learning_rate': 8.91935835259194e-06}[Rank 2] Trainer log: {'loss': 0.3715, 'grad_norm': 2.972278594970703, 'learning_rate': 8.91935835259194e-06} -[Rank 3] Trainer log: {'loss': 0.3715, 'grad_norm': 2.972278594970703, 'learning_rate': 8.91935835259194e-06} - -[Rank 0] Trainer log: {'loss': 0.3715, 'grad_norm': 2.972278594970703, 'learning_rate': 8.91935835259194e-06} -{'loss': 0.3715, 'grad_norm': 2.972278594970703, 'learning_rate': 8.91935835259194e-06, 'epoch': 0.56} -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021545719355344775, 'train/lm_loss': 5.3327123168855906e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.1538826823234558, 'train/uncertainty_loss': -6.634494056925178e-05, 'train/video_loss': 0.1555686742067337, 'train/total_loss': 0.15562200546264648} -tensor(0.2701, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1999, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.050134825706481936, 'train/info_loss': 0.21106058359146118, 'train/ref_loss': None, 'train/uncertainty_loss': -8.956401725299657e-05, 'train/video_loss': 0.21097101271152496, 'train/total_loss': 0.26110583543777466} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3201, 'grad_norm': 2.154449939727783, 'learning_rate': 8.90875389225473e-06}[Rank 3] Trainer log: {'loss': 0.3201, 'grad_norm': 2.154449939727783, 'learning_rate': 8.90875389225473e-06}[Rank 1] Trainer log: {'loss': 0.3201, 'grad_norm': 2.154449939727783, 'learning_rate': 8.90875389225473e-06} - -[Rank 2] Trainer log: {'loss': 0.3201, 'grad_norm': 2.154449939727783, 'learning_rate': 8.90875389225473e-06} - -{'loss': 0.3201, 'grad_norm': 2.154449939727783, 'learning_rate': 8.90875389225473e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32925484180450443, 'train/info_loss': 0.189666286110878, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001084381015971303, 'train/video_loss': 0.18955785036087036, 'train/total_loss': 0.5188126564025879} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0352, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1558, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001618057256564498, 'train/lm_loss': 4.150436725467444e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.16130885481834412, 'train/uncertainty_loss': -6.780005060136318e-05, 'train/video_loss': 0.16256125271320343, 'train/total_loss': 0.1626027524471283} -tensor(0.0430, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.342, 'grad_norm': 4.088895797729492, 'learning_rate': 8.898150673717974e-06}[Rank 0] Trainer log: {'loss': 0.342, 'grad_norm': 4.088895797729492, 'learning_rate': 8.898150673717974e-06} -[Rank 1] Trainer log: {'loss': 0.342, 'grad_norm': 4.088895797729492, 'learning_rate': 8.898150673717974e-06} - -[Rank 2] Trainer log: {'loss': 0.342, 'grad_norm': 4.088895797729492, 'learning_rate': 8.898150673717974e-06} -{'loss': 0.342, 'grad_norm': 4.088895797729492, 'learning_rate': 8.898150673717974e-06, 'epoch': 0.56} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2986, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0487, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017750251572579145, 'train/lm_loss': 3.614099114201963e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.2387169599533081, 'train/uncertainty_loss': 0.004867368936538697, 'train/video_loss': 0.24502813816070557, 'train/total_loss': 0.24506427347660065} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0476, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019851832184940578, 'train/lm_loss': 5.33986312802881e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.23379480838775635, 'train/uncertainty_loss': 0.004760876297950745, 'train/video_loss': 0.24017250537872314, 'train/total_loss': 0.2402259111404419} -tensor(0.0472, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2949, 'grad_norm': 3.2343595027923584, 'learning_rate': 8.887548709047765e-06}[Rank 3] Trainer log: {'loss': 0.2949, 'grad_norm': 3.2343595027923584, 'learning_rate': 8.887548709047765e-06} - -[Rank 0] Trainer log: {'loss': 0.2949, 'grad_norm': 3.2343595027923584, 'learning_rate': 8.887548709047765e-06} -[Rank 1] Trainer log: {'loss': 0.2949, 'grad_norm': 3.2343595027923584, 'learning_rate': 8.887548709047765e-06} -{'loss': 0.2949, 'grad_norm': 3.2343595027923584, 'learning_rate': 8.887548709047765e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2728, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024714339524507524, 'train/lm_loss': 4.705829196609557e-05, 'train/info_loss': 2.7178979507880285e-05, 'train/ref_loss': 0.10566742718219757, 'train/uncertainty_loss': -6.699634832330048e-05, 'train/video_loss': 0.10760475695133209, 'train/total_loss': 0.10765181481838226} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.47607378959655766, 'train/info_loss': 0.1374422162771225, 'train/ref_loss': None, 'train/uncertainty_loss': -9.784718276932836e-05, 'train/video_loss': 0.1373443752527237, 'train/total_loss': 0.6134181618690491} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3638, 'grad_norm': 6.0593719482421875, 'learning_rate': 8.876948010308778e-06}[Rank 2] Trainer log: {'loss': 0.3638, 'grad_norm': 6.0593719482421875, 'learning_rate': 8.876948010308778e-06} -[Rank 3] Trainer log: {'loss': 0.3638, 'grad_norm': 6.0593719482421875, 'learning_rate': 8.876948010308778e-06} - -[Rank 1] Trainer log: {'loss': 0.3638, 'grad_norm': 6.0593719482421875, 'learning_rate': 8.876948010308778e-06} -{'loss': 0.3638, 'grad_norm': 6.0593719482421875, 'learning_rate': 8.876948010308778e-06, 'epoch': 0.56} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.6101, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2323, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002478282433003187, 'train/lm_loss': 6.076366407796741e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.3708431124687195, 'train/uncertainty_loss': 0.023225906491279605, 'train/video_loss': 0.39608240127563477, 'train/total_loss': 0.3961431682109833} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07145253419876099, 'train/info_loss': 0.15283329784870148, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010471282294020057, 'train/video_loss': 0.1527285873889923, 'train/total_loss': 0.22418111562728882} -tensor(0.0276, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0218, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3678, 'grad_norm': 5.276296615600586, 'learning_rate': 8.866348589564238e-06}[Rank 2] Trainer log: {'loss': 0.3678, 'grad_norm': 5.276296615600586, 'learning_rate': 8.866348589564238e-06}[Rank 1] Trainer log: {'loss': 0.3678, 'grad_norm': 5.276296615600586, 'learning_rate': 8.866348589564238e-06} - - -[Rank 3] Trainer log: {'loss': 0.3678, 'grad_norm': 5.276296615600586, 'learning_rate': 8.866348589564238e-06} -{'loss': 0.3678, 'grad_norm': 5.276296615600586, 'learning_rate': 8.866348589564238e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20753505229949953, 'train/info_loss': 0.1934487223625183, 'train/ref_loss': None, 'train/uncertainty_loss': -9.038676507771015e-05, 'train/video_loss': 0.19335833191871643, 'train/total_loss': 0.4008933901786804} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.352232837677002, 'train/info_loss': 0.22788240015506744, 'train/ref_loss': None, 'train/uncertainty_loss': -9.911848464980722e-05, 'train/video_loss': 0.22778327763080597, 'train/total_loss': 0.5800161361694336} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3677, 'grad_norm': 2.821669578552246, 'learning_rate': 8.855750458875924e-06} -[Rank 3] Trainer log: {'loss': 0.3677, 'grad_norm': 2.821669578552246, 'learning_rate': 8.855750458875924e-06}[Rank 2] Trainer log: {'loss': 0.3677, 'grad_norm': 2.821669578552246, 'learning_rate': 8.855750458875924e-06} - -[Rank 0] Trainer log: {'loss': 0.3677, 'grad_norm': 2.821669578552246, 'learning_rate': 8.855750458875924e-06} -{'loss': 0.3677, 'grad_norm': 2.821669578552246, 'learning_rate': 8.855750458875924e-06, 'epoch': 0.56} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001815511379390955, 'train/lm_loss': 3.673692990560085e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.21026332676410675, 'train/uncertainty_loss': -7.051019347272814e-05, 'train/video_loss': 0.21166937053203583, 'train/total_loss': 0.21170610189437866} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35069749355316165, 'train/info_loss': 0.3894088566303253, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010250078048557043, 'train/video_loss': 0.38930636644363403, 'train/total_loss': 0.7400038242340088} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.7973, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5283, 'grad_norm': 9.05600357055664, 'learning_rate': 8.84515363030414e-06}[Rank 3] Trainer log: {'loss': 0.5283, 'grad_norm': 9.05600357055664, 'learning_rate': 8.84515363030414e-06}[Rank 0] Trainer log: {'loss': 0.5283, 'grad_norm': 9.05600357055664, 'learning_rate': 8.84515363030414e-06} - - -[Rank 2] Trainer log: {'loss': 0.5283, 'grad_norm': 9.05600357055664, 'learning_rate': 8.84515363030414e-06} -{'loss': 0.5283, 'grad_norm': 9.05600357055664, 'learning_rate': 8.84515363030414e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05237340331077576, 'train/info_loss': 0.17803144454956055, 'train/ref_loss': None, 'train/uncertainty_loss': -9.140969486907125e-05, 'train/video_loss': 0.17794004082679749, 'train/total_loss': 0.23031345009803772} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07830315828323364, 'train/info_loss': 0.14053326845169067, 'train/ref_loss': None, 'train/uncertainty_loss': -9.734202176332475e-05, 'train/video_loss': 0.14043591916561127, 'train/total_loss': 0.2187390774488449} -[Rank 1] Trainer log: {'loss': 0.3712, 'grad_norm': 5.217689514160156, 'learning_rate': 8.834558115907714e-06} -[Rank 0] Trainer log: {'loss': 0.3712, 'grad_norm': 5.217689514160156, 'learning_rate': 8.834558115907714e-06}[Rank 2] Trainer log: {'loss': 0.3712, 'grad_norm': 5.217689514160156, 'learning_rate': 8.834558115907714e-06} - -[Rank 3] Trainer log: {'loss': 0.3712, 'grad_norm': 5.217689514160156, 'learning_rate': 8.834558115907714e-06} -{'loss': 0.3712, 'grad_norm': 5.217689514160156, 'learning_rate': 8.834558115907714e-06, 'epoch': 0.56} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) {'train/tv_loss': None, 'train/lm_loss': 0.2928750991821289, 'train/info_loss': 0.17933721840381622, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010132931638509036, 'train/video_loss': 0.17923589050769806, 'train/total_loss': 0.4721109867095947}tensor(-0.0012, device='cuda:2', grad_fn=) - -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1817354440689087, 'train/info_loss': 0.2048591822385788, 'train/ref_loss': None, 'train/uncertainty_loss': -8.864918490871788e-05, 'train/video_loss': 0.2047705352306366, 'train/total_loss': 0.38650596141815186} -tensor(0.0982, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4335, 'grad_norm': 5.02566385269165, 'learning_rate': 8.823963927743979e-06}[Rank 0] Trainer log: {'loss': 0.4335, 'grad_norm': 5.02566385269165, 'learning_rate': 8.823963927743979e-06}[Rank 1] Trainer log: {'loss': 0.4335, 'grad_norm': 5.02566385269165, 'learning_rate': 8.823963927743979e-06} - - -[Rank 2] Trainer log: {'loss': 0.4335, 'grad_norm': 5.02566385269165, 'learning_rate': 8.823963927743979e-06} -{'loss': 0.4335, 'grad_norm': 5.02566385269165, 'learning_rate': 8.823963927743979e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1814, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002445435151457787, 'train/lm_loss': 4.164738929830492e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.3348836898803711, 'train/uncertainty_loss': 0.018141193687915804, 'train/video_loss': 0.35500773787498474, 'train/total_loss': 0.35504937171936035} -tensor(0.1643, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2767, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2495647192001343, 'train/info_loss': 0.13123972713947296, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011836185585707427, 'train/video_loss': 0.13112136721611023, 'train/total_loss': 0.38068610429763794} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4213, 'grad_norm': 13.190470695495605, 'learning_rate': 8.81337107786875e-06}[Rank 1] Trainer log: {'loss': 0.4213, 'grad_norm': 13.190470695495605, 'learning_rate': 8.81337107786875e-06}[Rank 0] Trainer log: {'loss': 0.4213, 'grad_norm': 13.190470695495605, 'learning_rate': 8.81337107786875e-06} - - -[Rank 3] Trainer log: {'loss': 0.4213, 'grad_norm': 13.190470695495605, 'learning_rate': 8.81337107786875e-06} -{'loss': 0.4213, 'grad_norm': 13.190470695495605, 'learning_rate': 8.81337107786875e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40624070167541504, 'train/info_loss': 0.235785111784935, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012330901809036733, 'train/video_loss': 0.2356618046760559, 'train/total_loss': 0.641902506351471} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15950651168823243, 'train/info_loss': 0.1889410763978958, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001036779023706913, 'train/video_loss': 0.18883739411830902, 'train/total_loss': 0.3483439087867737} -tensor(0.9440, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4282, 'grad_norm': 3.636773109436035, 'learning_rate': 8.80277957833633e-06}[Rank 0] Trainer log: {'loss': 0.4282, 'grad_norm': 3.636773109436035, 'learning_rate': 8.80277957833633e-06}[Rank 3] Trainer log: {'loss': 0.4282, 'grad_norm': 3.636773109436035, 'learning_rate': 8.80277957833633e-06} - -[Rank 2] Trainer log: {'loss': 0.4282, 'grad_norm': 3.636773109436035, 'learning_rate': 8.80277957833633e-06} - -{'loss': 0.4282, 'grad_norm': 3.636773109436035, 'learning_rate': 8.80277957833633e-06, 'epoch': 0.56} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0441, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006576369516551495, 'train/lm_loss': 6.066832575015724e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.1642296016216278, 'train/uncertainty_loss': -7.178793312050402e-05, 'train/video_loss': 0.16944804787635803, 'train/total_loss': 0.16950871050357819} -tensor(0.1356, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1543, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0693, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020483718253672123, 'train/lm_loss': 4.109914007131011e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.11175263673067093, 'train/uncertainty_loss': -7.279793499037623e-05, 'train/video_loss': 0.11334463953971863, 'train/total_loss': 0.11338573694229126} -[Rank 1] Trainer log: {'loss': 0.2027, 'grad_norm': 10.542369842529297, 'learning_rate': 8.792189441199478e-06}[Rank 3] Trainer log: {'loss': 0.2027, 'grad_norm': 10.542369842529297, 'learning_rate': 8.792189441199478e-06} - -[Rank 2] Trainer log: {'loss': 0.2027, 'grad_norm': 10.542369842529297, 'learning_rate': 8.792189441199478e-06}[Rank 0] Trainer log: {'loss': 0.2027, 'grad_norm': 10.542369842529297, 'learning_rate': 8.792189441199478e-06} - -{'loss': 0.2027, 'grad_norm': 10.542369842529297, 'learning_rate': 8.792189441199478e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21438729763031006, 'train/info_loss': 0.18680749833583832, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011876063654199244, 'train/video_loss': 0.18668873608112335, 'train/total_loss': 0.4010760188102722} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.005070081725716592, 'train/info_loss': 0.15142688155174255, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010908471886068584, 'train/video_loss': 0.1513177901506424, 'train/total_loss': 0.15638786554336548} -tensor(0.0099, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3044, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3298, 'grad_norm': 5.885463714599609, 'learning_rate': 8.781600678509404e-06}[Rank 3] Trainer log: {'loss': 0.3298, 'grad_norm': 5.885463714599609, 'learning_rate': 8.781600678509404e-06} - -[Rank 1] Trainer log: {'loss': 0.3298, 'grad_norm': 5.885463714599609, 'learning_rate': 8.781600678509404e-06} -[Rank 2] Trainer log: {'loss': 0.3298, 'grad_norm': 5.885463714599609, 'learning_rate': 8.781600678509404e-06} -{'loss': 0.3298, 'grad_norm': 5.885463714599609, 'learning_rate': 8.781600678509404e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06866081357002259, 'train/info_loss': 0.12299181520938873, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001139504136517644, 'train/video_loss': 0.12287786602973938, 'train/total_loss': 0.19153869152069092} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0984, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1536, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31163835525512695, 'train/info_loss': 0.19720619916915894, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010820142924785615, 'train/video_loss': 0.1970980018377304, 'train/total_loss': 0.5087363719940186} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3491, 'grad_norm': 8.226096153259277, 'learning_rate': 8.771013302315756e-06}[Rank 1] Trainer log: {'loss': 0.3491, 'grad_norm': 8.226096153259277, 'learning_rate': 8.771013302315756e-06}[Rank 3] Trainer log: {'loss': 0.3491, 'grad_norm': 8.226096153259277, 'learning_rate': 8.771013302315756e-06} - - -[Rank 0] Trainer log: {'loss': 0.3491, 'grad_norm': 8.226096153259277, 'learning_rate': 8.771013302315756e-06} -{'loss': 0.3491, 'grad_norm': 8.226096153259277, 'learning_rate': 8.771013302315756e-06, 'epoch': 0.56} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26744649410247806, 'train/info_loss': 0.12647970020771027, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010034560691565276, 'train/video_loss': 0.1263793557882309, 'train/total_loss': 0.39382582902908325} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4032500743865967, 'train/info_loss': 0.29580286145210266, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001491114846430719, 'train/video_loss': 0.295653760433197, 'train/total_loss': 0.6989037990570068} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3368, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.408, 'grad_norm': 7.347342491149902, 'learning_rate': 8.760427324666601e-06}[Rank 1] Trainer log: {'loss': 0.408, 'grad_norm': 7.347342491149902, 'learning_rate': 8.760427324666601e-06} - -[Rank 2] Trainer log: {'loss': 0.408, 'grad_norm': 7.347342491149902, 'learning_rate': 8.760427324666601e-06}[Rank 0] Trainer log: {'loss': 0.408, 'grad_norm': 7.347342491149902, 'learning_rate': 8.760427324666601e-06} - -{'loss': 0.408, 'grad_norm': 7.347342491149902, 'learning_rate': 8.760427324666601e-06, 'epoch': 0.56} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22616579532623293, 'train/info_loss': 0.16564373672008514, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000103292940184474, 'train/video_loss': 0.1655404418706894, 'train/total_loss': 0.3917062282562256} -tensor(0.0424, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4593, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31718192100524906, 'train/info_loss': 0.2083832174539566, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010285479947924614, 'train/video_loss': 0.20828036963939667, 'train/total_loss': 0.52546226978302} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2516, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3506, 'grad_norm': 3.3307695388793945, 'learning_rate': 8.749842757608423e-06}[Rank 1] Trainer log: {'loss': 0.3506, 'grad_norm': 3.3307695388793945, 'learning_rate': 8.749842757608423e-06} - -[Rank 0] Trainer log: {'loss': 0.3506, 'grad_norm': 3.3307695388793945, 'learning_rate': 8.749842757608423e-06} -[Rank 2] Trainer log: {'loss': 0.3506, 'grad_norm': 3.3307695388793945, 'learning_rate': 8.749842757608423e-06} -{'loss': 0.3506, 'grad_norm': 3.3307695388793945, 'learning_rate': 8.749842757608423e-06, 'epoch': 0.56} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0866, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002316743601113558, 'train/lm_loss': 3.6617740988731384e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.26969876885414124, 'train/uncertainty_loss': 0.008660124987363816, 'train/video_loss': 0.2802353799343109, 'train/total_loss': 0.2802720069885254} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11208038330078125, 'train/info_loss': 0.12285266816616058, 'train/ref_loss': None, 'train/uncertainty_loss': -9.529684321023524e-05, 'train/video_loss': 0.12275736778974533, 'train/total_loss': 0.23483775556087494} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3146, 'grad_norm': 5.764158248901367, 'learning_rate': 8.739259613186086e-06}[Rank 1] Trainer log: {'loss': 0.3146, 'grad_norm': 5.764158248901367, 'learning_rate': 8.739259613186086e-06} -[Rank 3] Trainer log: {'loss': 0.3146, 'grad_norm': 5.764158248901367, 'learning_rate': 8.739259613186086e-06} - -[Rank 2] Trainer log: {'loss': 0.3146, 'grad_norm': 5.764158248901367, 'learning_rate': 8.739259613186086e-06} -{'loss': 0.3146, 'grad_norm': 5.764158248901367, 'learning_rate': 8.739259613186086e-06, 'epoch': 0.56} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2773792266845703, 'train/info_loss': 0.15449626743793488, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010368695948272944, 'train/video_loss': 0.15439258515834808, 'train/total_loss': 0.43177181482315063} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0732, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0628, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2852, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002894368954002857, 'train/lm_loss': 5.3327123168855906e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.4057480990886688, 'train/uncertainty_loss': 0.028523340821266174, 'train/video_loss': 0.4366149306297302, 'train/total_loss': 0.4366682469844818} -[Rank 3] Trainer log: {'loss': 0.3436, 'grad_norm': 12.813337326049805, 'learning_rate': 8.728677903442853e-06} -[Rank 1] Trainer log: {'loss': 0.3436, 'grad_norm': 12.813337326049805, 'learning_rate': 8.728677903442853e-06} -[Rank 2] Trainer log: {'loss': 0.3436, 'grad_norm': 12.813337326049805, 'learning_rate': 8.728677903442853e-06} -[Rank 0] Trainer log: {'loss': 0.3436, 'grad_norm': 12.813337326049805, 'learning_rate': 8.728677903442853e-06} -{'loss': 0.3436, 'grad_norm': 12.813337326049805, 'learning_rate': 8.728677903442853e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09068726301193238, 'train/info_loss': 0.10971034318208694, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011255614226683975, 'train/video_loss': 0.10959778726100922, 'train/total_loss': 0.20028504729270935} -tensor(0.0019, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2397413492202759, 'train/info_loss': 0.19256441295146942, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001390017569065094, 'train/video_loss': 0.1924254149198532, 'train/total_loss': 0.4321667551994324} -tensor(0.1316, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2468, 'grad_norm': 5.531517028808594, 'learning_rate': 8.718097640420335e-06}[Rank 0] Trainer log: {'loss': 0.2468, 'grad_norm': 5.531517028808594, 'learning_rate': 8.718097640420335e-06} -[Rank 3] Trainer log: {'loss': 0.2468, 'grad_norm': 5.531517028808594, 'learning_rate': 8.718097640420335e-06} - -[Rank 2] Trainer log: {'loss': 0.2468, 'grad_norm': 5.531517028808594, 'learning_rate': 8.718097640420335e-06} -{'loss': 0.2468, 'grad_norm': 5.531517028808594, 'learning_rate': 8.718097640420335e-06, 'epoch': 0.56} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21541898250579836, 'train/info_loss': 0.2547071874141693, 'train/ref_loss': None, 'train/uncertainty_loss': -8.914447971619666e-05, 'train/video_loss': 0.2546180486679077, 'train/total_loss': 0.470037043094635} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1989, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2489, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002568110125139356, 'train/lm_loss': 5.387534038163722e-05, 'train/info_loss': 2.8490208933362737e-05, 'train/ref_loss': 0.3808397650718689, 'train/uncertainty_loss': 0.024894624948501587, 'train/video_loss': 0.40781736373901367, 'train/total_loss': 0.4078712463378906} -tensor(0.1754, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.387, 'grad_norm': 4.650364398956299, 'learning_rate': 8.707518836158513e-06}[Rank 3] Trainer log: {'loss': 0.387, 'grad_norm': 4.650364398956299, 'learning_rate': 8.707518836158513e-06} - -[Rank 1] Trainer log: {'loss': 0.387, 'grad_norm': 4.650364398956299, 'learning_rate': 8.707518836158513e-06} -[Rank 2] Trainer log: {'loss': 0.387, 'grad_norm': 4.650364398956299, 'learning_rate': 8.707518836158513e-06} -{'loss': 0.387, 'grad_norm': 4.650364398956299, 'learning_rate': 8.707518836158513e-06, 'epoch': 0.56} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3062197208404541, 'train/info_loss': 0.2516930401325226, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013053921284154058, 'train/video_loss': 0.2515625059604645, 'train/total_loss': 0.5577822327613831} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0979, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1868668079376221, 'train/info_loss': 0.16405779123306274, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011114072985947133, 'train/video_loss': 0.16394664347171783, 'train/total_loss': 0.35081344842910767} -tensor(0.0925, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4078, 'grad_norm': 11.659296035766602, 'learning_rate': 8.6969415026957e-06} -[Rank 0] Trainer log: {'loss': 0.4078, 'grad_norm': 11.659296035766602, 'learning_rate': 8.6969415026957e-06}[Rank 2] Trainer log: {'loss': 0.4078, 'grad_norm': 11.659296035766602, 'learning_rate': 8.6969415026957e-06} - -[Rank 3] Trainer log: {'loss': 0.4078, 'grad_norm': 11.659296035766602, 'learning_rate': 8.6969415026957e-06} -{'loss': 0.4078, 'grad_norm': 11.659296035766602, 'learning_rate': 8.6969415026957e-06, 'epoch': 0.56} -tensor(0.1039, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0509, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3389082193374634, 'train/info_loss': 0.24557803571224213, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012158537283539772, 'train/video_loss': 0.24545645713806152, 'train/total_loss': 0.584364652633667} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0279, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002773832529783249, 'train/lm_loss': 3.239845973439515e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.21696653962135315, 'train/uncertainty_loss': 0.0027878893539309505, 'train/video_loss': 0.22199727594852448, 'train/total_loss': 0.2220296710729599} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3273, 'grad_norm': 2.679468870162964, 'learning_rate': 8.686365652068536e-06}[Rank 0] Trainer log: {'loss': 0.3273, 'grad_norm': 2.679468870162964, 'learning_rate': 8.686365652068536e-06} -[Rank 3] Trainer log: {'loss': 0.3273, 'grad_norm': 2.679468870162964, 'learning_rate': 8.686365652068536e-06} - -[Rank 1] Trainer log: {'loss': 0.3273, 'grad_norm': 2.679468870162964, 'learning_rate': 8.686365652068536e-06} -{'loss': 0.3273, 'grad_norm': 2.679468870162964, 'learning_rate': 8.686365652068536e-06, 'epoch': 0.56} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2465620756149292, 'train/info_loss': 0.2372182160615921, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013496502069756388, 'train/video_loss': 0.23708325624465942, 'train/total_loss': 0.48364531993865967} -tensor(0.1229, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0194, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022428452502936126, 'train/lm_loss': 3.223159583285451e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.10810238122940063, 'train/uncertainty_loss': -7.374417618848384e-05, 'train/video_loss': 0.10984455049037933, 'train/total_loss': 0.10987678170204163} -[Rank 0] Trainer log: {'loss': 0.2843, 'grad_norm': 7.949274063110352, 'learning_rate': 8.675791296311976e-06}[Rank 3] Trainer log: {'loss': 0.2843, 'grad_norm': 7.949274063110352, 'learning_rate': 8.675791296311976e-06} - -[Rank 1] Trainer log: {'loss': 0.2843, 'grad_norm': 7.949274063110352, 'learning_rate': 8.675791296311976e-06} -[Rank 2] Trainer log: {'loss': 0.2843, 'grad_norm': 7.949274063110352, 'learning_rate': 8.675791296311976e-06} -{'loss': 0.2843, 'grad_norm': 7.949274063110352, 'learning_rate': 8.675791296311976e-06, 'epoch': 0.57} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24960243701934814, 'train/info_loss': 0.12290974706411362, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010393400443717838, 'train/video_loss': 0.12280581146478653, 'train/total_loss': 0.3724082410335541} -tensor(0.0928, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10687600374221802, 'train/info_loss': 0.18781112134456635, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010536249028518796, 'train/video_loss': 0.18770575523376465, 'train/total_loss': 0.2945817708969116} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2568, 'grad_norm': 8.414517402648926, 'learning_rate': 8.66521844745927e-06}[Rank 2] Trainer log: {'loss': 0.2568, 'grad_norm': 8.414517402648926, 'learning_rate': 8.66521844745927e-06} -[Rank 1] Trainer log: {'loss': 0.2568, 'grad_norm': 8.414517402648926, 'learning_rate': 8.66521844745927e-06} -[Rank 3] Trainer log: {'loss': 0.2568, 'grad_norm': 8.414517402648926, 'learning_rate': 8.66521844745927e-06} - -{'loss': 0.2568, 'grad_norm': 8.414517402648926, 'learning_rate': 8.66521844745927e-06, 'epoch': 0.57} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09305580854415894, 'train/info_loss': 0.22605526447296143, 'train/ref_loss': None, 'train/uncertainty_loss': -9.250458679161967e-05, 'train/video_loss': 0.22596275806427002, 'train/total_loss': 0.31901857256889343} -tensor(0.1794, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0263, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5246, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16306209564208984, 'train/info_loss': 0.15752574801445007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010764681501314045, 'train/video_loss': 0.15741810202598572, 'train/total_loss': 0.32048019766807556} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4046, 'grad_norm': 10.179515838623047, 'learning_rate': 8.65464711754196e-06}[Rank 1] Trainer log: {'loss': 0.4046, 'grad_norm': 10.179515838623047, 'learning_rate': 8.65464711754196e-06} - -[Rank 0] Trainer log: {'loss': 0.4046, 'grad_norm': 10.179515838623047, 'learning_rate': 8.65464711754196e-06}[Rank 3] Trainer log: {'loss': 0.4046, 'grad_norm': 10.179515838623047, 'learning_rate': 8.65464711754196e-06} - -{'loss': 0.4046, 'grad_norm': 10.179515838623047, 'learning_rate': 8.65464711754196e-06, 'epoch': 0.57} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18216742277145387, 'train/info_loss': 0.23790018260478973, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011933641508221626, 'train/video_loss': 0.2377808392047882, 'train/total_loss': 0.4199482798576355} -tensor(0.3796, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3737796783447266, 'train/info_loss': 0.246211975812912, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012285507982596756, 'train/video_loss': 0.2460891157388687, 'train/total_loss': 0.619868814945221} -tensor(0.0831, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2211, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3666, 'grad_norm': 10.094522476196289, 'learning_rate': 8.644077318589848e-06}[Rank 1] Trainer log: {'loss': 0.3666, 'grad_norm': 10.094522476196289, 'learning_rate': 8.644077318589848e-06}[Rank 0] Trainer log: {'loss': 0.3666, 'grad_norm': 10.094522476196289, 'learning_rate': 8.644077318589848e-06} - -[Rank 2] Trainer log: {'loss': 0.3666, 'grad_norm': 10.094522476196289, 'learning_rate': 8.644077318589848e-06} - -{'loss': 0.3666, 'grad_norm': 10.094522476196289, 'learning_rate': 8.644077318589848e-06, 'epoch': 0.57} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1257664442062378, 'train/info_loss': 0.14834293723106384, 'train/ref_loss': None, 'train/uncertainty_loss': -9.224725654348731e-05, 'train/video_loss': 0.14825068414211273, 'train/total_loss': 0.2740171253681183} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.3493, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018505933694541455, 'train/lm_loss': 3.666541597340256e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.1892111450433731, 'train/uncertainty_loss': -6.954813725315035e-05, 'train/video_loss': 0.1906474083662033, 'train/total_loss': 0.19068408012390137} -[Rank 0] Trainer log: {'loss': 0.3525, 'grad_norm': 6.083531379699707, 'learning_rate': 8.633509062631008e-06}[Rank 2] Trainer log: {'loss': 0.3525, 'grad_norm': 6.083531379699707, 'learning_rate': 8.633509062631008e-06}[Rank 1] Trainer log: {'loss': 0.3525, 'grad_norm': 6.083531379699707, 'learning_rate': 8.633509062631008e-06} - -[Rank 3] Trainer log: {'loss': 0.3525, 'grad_norm': 6.083531379699707, 'learning_rate': 8.633509062631008e-06} - -{'loss': 0.3525, 'grad_norm': 6.083531379699707, 'learning_rate': 8.633509062631008e-06, 'epoch': 0.57} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2006915092468262, 'train/info_loss': 0.10838840901851654, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011667845537886024, 'train/video_loss': 0.10827173292636871, 'train/total_loss': 0.30896323919296265} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.4235, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06288254857063294, 'train/info_loss': 0.2131846845149994, 'train/ref_loss': None, 'train/uncertainty_loss': -8.911635377444327e-05, 'train/video_loss': 0.21309557557106018, 'train/total_loss': 0.27597811818122864} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.321, 'grad_norm': 7.394283771514893, 'learning_rate': 8.622942361691744e-06}[Rank 1] Trainer log: {'loss': 0.321, 'grad_norm': 7.394283771514893, 'learning_rate': 8.622942361691744e-06}[Rank 3] Trainer log: {'loss': 0.321, 'grad_norm': 7.394283771514893, 'learning_rate': 8.622942361691744e-06} - - -[Rank 0] Trainer log: {'loss': 0.321, 'grad_norm': 7.394283771514893, 'learning_rate': 8.622942361691744e-06} -{'loss': 0.321, 'grad_norm': 7.394283771514893, 'learning_rate': 8.622942361691744e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35930676460266114, 'train/info_loss': 0.18881697952747345, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010813293047249318, 'train/video_loss': 0.1887088418006897, 'train/total_loss': 0.5480155944824219} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0286, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0991, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018963334150612354, 'train/lm_loss': 5.2612059516832235e-05, 'train/info_loss': 2.592734745121561e-05, 'train/ref_loss': 0.28451013565063477, 'train/uncertainty_loss': 0.00990513861179352, 'train/video_loss': 0.2959582507610321, 'train/total_loss': 0.2960108518600464} -tensor(0.1494, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3163, 'grad_norm': 3.8802151679992676, 'learning_rate': 8.612377227796602e-06}[Rank 0] Trainer log: {'loss': 0.3163, 'grad_norm': 3.8802151679992676, 'learning_rate': 8.612377227796602e-06} -[Rank 2] Trainer log: {'loss': 0.3163, 'grad_norm': 3.8802151679992676, 'learning_rate': 8.612377227796602e-06} -[Rank 1] Trainer log: {'loss': 0.3163, 'grad_norm': 3.8802151679992676, 'learning_rate': 8.612377227796602e-06} - -{'loss': 0.3163, 'grad_norm': 3.8802151679992676, 'learning_rate': 8.612377227796602e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37515149116516117, 'train/info_loss': 0.1729058027267456, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010675262892618776, 'train/video_loss': 0.17279905080795288, 'train/total_loss': 0.5479505062103271} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1579, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11128069162368776, 'train/info_loss': 0.12162915617227554, 'train/ref_loss': None, 'train/uncertainty_loss': -9.07807843759656e-05, 'train/video_loss': 0.12153837829828262, 'train/total_loss': 0.2328190803527832} -tensor(0.1750, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1717, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3613, 'grad_norm': 8.820052146911621, 'learning_rate': 8.601813672968342e-06}[Rank 1] Trainer log: {'loss': 0.3613, 'grad_norm': 8.820052146911621, 'learning_rate': 8.601813672968342e-06} - -[Rank 0] Trainer log: {'loss': 0.3613, 'grad_norm': 8.820052146911621, 'learning_rate': 8.601813672968342e-06}[Rank 3] Trainer log: {'loss': 0.3613, 'grad_norm': 8.820052146911621, 'learning_rate': 8.601813672968342e-06} - -{'loss': 0.3613, 'grad_norm': 8.820052146911621, 'learning_rate': 8.601813672968342e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3432, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024874298833310604, 'train/lm_loss': 3.2017051125876606e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.4461354613304138, 'train/uncertainty_loss': 0.03431602716445923, 'train/video_loss': 0.48246413469314575, 'train/total_loss': 0.48249614238739014} -tensor(0.0321, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002884474815800786, 'train/lm_loss': 3.66415799362585e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.08888167887926102, 'train/uncertainty_loss': -6.630058633163572e-05, 'train/video_loss': 0.09114709496498108, 'train/total_loss': 0.09118373692035675} -tensor(0.0296, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2792, 'grad_norm': 7.225982666015625, 'learning_rate': 8.59125170922792e-06}[Rank 0] Trainer log: {'loss': 0.2792, 'grad_norm': 7.225982666015625, 'learning_rate': 8.59125170922792e-06} -[Rank 3] Trainer log: {'loss': 0.2792, 'grad_norm': 7.225982666015625, 'learning_rate': 8.59125170922792e-06} - -[Rank 2] Trainer log: {'loss': 0.2792, 'grad_norm': 7.225982666015625, 'learning_rate': 8.59125170922792e-06} -{'loss': 0.2792, 'grad_norm': 7.225982666015625, 'learning_rate': 8.59125170922792e-06, 'epoch': 0.57} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4634280681610108, 'train/info_loss': 0.2141062617301941, 'train/ref_loss': None, 'train/uncertainty_loss': -9.710198501124979e-05, 'train/video_loss': 0.21400916576385498, 'train/total_loss': 0.6774372458457947} -tensor(0.1320, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31789560317993165, 'train/info_loss': 0.17526833713054657, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011702567571774125, 'train/video_loss': 0.17515131831169128, 'train/total_loss': 0.49304693937301636} -tensor(0.0184, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3067, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3956, 'grad_norm': 16.138051986694336, 'learning_rate': 8.580691348594493e-06}[Rank 0] Trainer log: {'loss': 0.3956, 'grad_norm': 16.138051986694336, 'learning_rate': 8.580691348594493e-06}[Rank 2] Trainer log: {'loss': 0.3956, 'grad_norm': 16.138051986694336, 'learning_rate': 8.580691348594493e-06} - - -[Rank 3] Trainer log: {'loss': 0.3956, 'grad_norm': 16.138051986694336, 'learning_rate': 8.580691348594493e-06} -{'loss': 0.3956, 'grad_norm': 16.138051986694336, 'learning_rate': 8.580691348594493e-06, 'epoch': 0.57} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0933, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1752, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018219378544017675, 'train/lm_loss': 5.3470139391720295e-05, 'train/info_loss': 2.8013399060000665e-05, 'train/ref_loss': 0.3299592435359955, 'train/uncertainty_loss': 0.017518487572669984, 'train/video_loss': 0.34896329045295715, 'train/total_loss': 0.3490167558193207} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2303, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4110119342803955, 'train/info_loss': 0.1445493996143341, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013065721141174437, 'train/video_loss': 0.14441874623298645, 'train/total_loss': 0.5554306507110596} -[Rank 3] Trainer log: {'loss': 0.4358, 'grad_norm': 9.632416725158691, 'learning_rate': 8.570132603085378e-06} -[Rank 0] Trainer log: {'loss': 0.4358, 'grad_norm': 9.632416725158691, 'learning_rate': 8.570132603085378e-06}[Rank 1] Trainer log: {'loss': 0.4358, 'grad_norm': 9.632416725158691, 'learning_rate': 8.570132603085378e-06} -[Rank 2] Trainer log: {'loss': 0.4358, 'grad_norm': 9.632416725158691, 'learning_rate': 8.570132603085378e-06} - -{'loss': 0.4358, 'grad_norm': 9.632416725158691, 'learning_rate': 8.570132603085378e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002169486600905657, 'train/lm_loss': 4.164738929830492e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.1916642189025879, 'train/uncertainty_loss': -7.312980014830829e-05, 'train/video_loss': 0.19335319101810455, 'train/total_loss': 0.19339483976364136} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4820, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0669, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002209642203524709, 'train/lm_loss': 2.856050559785217e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.2605785131454468, 'train/uncertainty_loss': 0.006688939034938813, 'train/video_loss': 0.2690565288066864, 'train/total_loss': 0.2690850794315338} -[Rank 1] Trainer log: {'loss': 0.3247, 'grad_norm': 14.97860336303711, 'learning_rate': 8.559575484716075e-06}[Rank 3] Trainer log: {'loss': 0.3247, 'grad_norm': 14.97860336303711, 'learning_rate': 8.559575484716075e-06} -[Rank 0] Trainer log: {'loss': 0.3247, 'grad_norm': 14.97860336303711, 'learning_rate': 8.559575484716075e-06} -[Rank 2] Trainer log: {'loss': 0.3247, 'grad_norm': 14.97860336303711, 'learning_rate': 8.559575484716075e-06} - -{'loss': 0.3247, 'grad_norm': 14.97860336303711, 'learning_rate': 8.559575484716075e-06, 'epoch': 0.57} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0318, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1465, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2760, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002605001907795668, 'train/lm_loss': 6.107351509854198e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.3984062075614929, 'train/uncertainty_loss': 0.02760396897792816, 'train/video_loss': 0.4281228184700012, 'train/total_loss': 0.4281838834285736} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0881, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0152, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021300564985722304, 'train/lm_loss': 0.0001309204613789916, 'train/info_loss': 3.57019089278765e-05, 'train/ref_loss': 0.11155346035957336, 'train/uncertainty_loss': -6.875089020468295e-05, 'train/video_loss': 0.11322445422410965, 'train/total_loss': 0.1133553758263588} -[Rank 1] Trainer log: {'loss': 0.2954, 'grad_norm': 12.71616268157959, 'learning_rate': 8.54902000550021e-06} -[Rank 0] Trainer log: {'loss': 0.2954, 'grad_norm': 12.71616268157959, 'learning_rate': 8.54902000550021e-06}[Rank 2] Trainer log: {'loss': 0.2954, 'grad_norm': 12.71616268157959, 'learning_rate': 8.54902000550021e-06} -[Rank 3] Trainer log: {'loss': 0.2954, 'grad_norm': 12.71616268157959, 'learning_rate': 8.54902000550021e-06} - -{'loss': 0.2954, 'grad_norm': 12.71616268157959, 'learning_rate': 8.54902000550021e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016534507740288974, 'train/lm_loss': 3.182634827680886e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.1684214174747467, 'train/uncertainty_loss': -7.011963753029704e-05, 'train/video_loss': 0.1696978360414505, 'train/total_loss': 0.16972966492176056} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3120708465576172, 'train/info_loss': 0.15786534547805786, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010034021688625217, 'train/video_loss': 0.1577650010585785, 'train/total_loss': 0.4698358476161957} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3718, 'grad_norm': 2.2067136764526367, 'learning_rate': 8.538466177449559e-06} -[Rank 0] Trainer log: {'loss': 0.3718, 'grad_norm': 2.2067136764526367, 'learning_rate': 8.538466177449559e-06}[Rank 3] Trainer log: {'loss': 0.3718, 'grad_norm': 2.2067136764526367, 'learning_rate': 8.538466177449559e-06} -[Rank 2] Trainer log: {'loss': 0.3718, 'grad_norm': 2.2067136764526367, 'learning_rate': 8.538466177449559e-06} - -{'loss': 0.3718, 'grad_norm': 2.2067136764526367, 'learning_rate': 8.538466177449559e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.060002201795578004, 'train/info_loss': 0.14454637467861176, 'train/ref_loss': None, 'train/uncertainty_loss': -8.780016214586795e-05, 'train/video_loss': 0.1444585770368576, 'train/total_loss': 0.20446078479290009} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04544896483421326, 'train/info_loss': 0.23591840267181396, 'train/ref_loss': None, 'train/uncertainty_loss': -8.587419870309532e-05, 'train/video_loss': 0.23583252727985382, 'train/total_loss': 0.2812815010547638} -tensor(0.2751, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3953, 'grad_norm': 6.607307434082031, 'learning_rate': 8.52791401257402e-06}[Rank 1] Trainer log: {'loss': 0.3953, 'grad_norm': 6.607307434082031, 'learning_rate': 8.52791401257402e-06} - -[Rank 2] Trainer log: {'loss': 0.3953, 'grad_norm': 6.607307434082031, 'learning_rate': 8.52791401257402e-06}[Rank 0] Trainer log: {'loss': 0.3953, 'grad_norm': 6.607307434082031, 'learning_rate': 8.52791401257402e-06} - -{'loss': 0.3953, 'grad_norm': 6.607307434082031, 'learning_rate': 8.52791401257402e-06, 'epoch': 0.57} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03749457895755768, 'train/info_loss': 0.267259806394577, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010700656566768885, 'train/video_loss': 0.2671527862548828, 'train/total_loss': 0.3046473562717438} -tensor(0.2301, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3401, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2877607583999634, 'train/info_loss': 0.17437371611595154, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014204932376742365, 'train/video_loss': 0.1742316633462906, 'train/total_loss': 0.46199244260787964} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1524, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4524, 'grad_norm': 12.892083168029785, 'learning_rate': 8.51736352288158e-06}[Rank 0] Trainer log: {'loss': 0.4524, 'grad_norm': 12.892083168029785, 'learning_rate': 8.51736352288158e-06} - -[Rank 3] Trainer log: {'loss': 0.4524, 'grad_norm': 12.892083168029785, 'learning_rate': 8.51736352288158e-06}[Rank 2] Trainer log: {'loss': 0.4524, 'grad_norm': 12.892083168029785, 'learning_rate': 8.51736352288158e-06} - -{'loss': 0.4524, 'grad_norm': 12.892083168029785, 'learning_rate': 8.51736352288158e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30454246997833256, 'train/info_loss': 0.20316146314144135, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001248778193257749, 'train/video_loss': 0.2030365914106369, 'train/total_loss': 0.5075790882110596} -tensor(0.2867, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.2353, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26230595111846927, 'train/info_loss': 0.1996956318616867, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012898585991933943, 'train/video_loss': 0.19956664741039276, 'train/total_loss': 0.46187257766723633} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3932, 'grad_norm': 3.290391445159912, 'learning_rate': 8.506814720378346e-06}[Rank 3] Trainer log: {'loss': 0.3932, 'grad_norm': 3.290391445159912, 'learning_rate': 8.506814720378346e-06}[Rank 0] Trainer log: {'loss': 0.3932, 'grad_norm': 3.290391445159912, 'learning_rate': 8.506814720378346e-06} - - -[Rank 2] Trainer log: {'loss': 0.3932, 'grad_norm': 3.290391445159912, 'learning_rate': 8.506814720378346e-06} -{'loss': 0.3932, 'grad_norm': 3.290391445159912, 'learning_rate': 8.506814720378346e-06, 'epoch': 0.57} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07941729426383973, 'train/info_loss': 0.19724038243293762, 'train/ref_loss': None, 'train/uncertainty_loss': -9.715104824863375e-05, 'train/video_loss': 0.19714322686195374, 'train/total_loss': 0.276560515165329} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34917864799499515, 'train/info_loss': 0.20755964517593384, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011765406234189869, 'train/video_loss': 0.20744198560714722, 'train/total_loss': 0.5566205978393555} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3394, 'grad_norm': 3.0609726905822754, 'learning_rate': 8.496267617068483e-06} -[Rank 2] Trainer log: {'loss': 0.3394, 'grad_norm': 3.0609726905822754, 'learning_rate': 8.496267617068483e-06}[Rank 1] Trainer log: {'loss': 0.3394, 'grad_norm': 3.0609726905822754, 'learning_rate': 8.496267617068483e-06} - -[Rank 0] Trainer log: {'loss': 0.3394, 'grad_norm': 3.0609726905822754, 'learning_rate': 8.496267617068483e-06} -{'loss': 0.3394, 'grad_norm': 3.0609726905822754, 'learning_rate': 8.496267617068483e-06, 'epoch': 0.57} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27154462337493895, 'train/info_loss': 0.2181044965982437, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013400971656665205, 'train/video_loss': 0.21797049045562744, 'train/total_loss': 0.48951512575149536} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.6607, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0292, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00041379244066774845, 'train/lm_loss': 5.3660821868106724e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.21292835474014282, 'train/uncertainty_loss': 0.002921386063098908, 'train/video_loss': 0.21918702125549316, 'train/total_loss': 0.2192406803369522} -[Rank 1] Trainer log: {'loss': 0.3941, 'grad_norm': 15.842937469482422, 'learning_rate': 8.485722224954237e-06} -[Rank 3] Trainer log: {'loss': 0.3941, 'grad_norm': 15.842937469482422, 'learning_rate': 8.485722224954237e-06}[Rank 0] Trainer log: {'loss': 0.3941, 'grad_norm': 15.842937469482422, 'learning_rate': 8.485722224954237e-06} - -[Rank 2] Trainer log: {'loss': 0.3941, 'grad_norm': 15.842937469482422, 'learning_rate': 8.485722224954237e-06} -{'loss': 0.3941, 'grad_norm': 15.842937469482422, 'learning_rate': 8.485722224954237e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3824377298355103, 'train/info_loss': 0.27648282051086426, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012352197663858532, 'train/video_loss': 0.27635928988456726, 'train/total_loss': 0.658797025680542} -tensor(0.0233, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(0.0752, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3325687885284424, 'train/info_loss': 0.22957658767700195, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013048560358583928, 'train/video_loss': 0.22944609820842743, 'train/total_loss': 0.5620148777961731} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4372, 'grad_norm': 2.838341236114502, 'learning_rate': 8.475178556035893e-06}[Rank 2] Trainer log: {'loss': 0.4372, 'grad_norm': 2.838341236114502, 'learning_rate': 8.475178556035893e-06} - -[Rank 0] Trainer log: {'loss': 0.4372, 'grad_norm': 2.838341236114502, 'learning_rate': 8.475178556035893e-06} -[Rank 3] Trainer log: {'loss': 0.4372, 'grad_norm': 2.838341236114502, 'learning_rate': 8.475178556035893e-06} -{'loss': 0.4372, 'grad_norm': 2.838341236114502, 'learning_rate': 8.475178556035893e-06, 'epoch': 0.57} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09418655633926393, 'train/info_loss': 0.20841072499752045, 'train/ref_loss': None, 'train/uncertainty_loss': -8.295951993204654e-05, 'train/video_loss': 0.2083277702331543, 'train/total_loss': 0.30251431465148926} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37814581394195557, 'train/info_loss': 0.24560746550559998, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001426283153705299, 'train/video_loss': 0.24546483159065247, 'train/total_loss': 0.6236106157302856} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0515, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3973, 'grad_norm': 4.344086647033691, 'learning_rate': 8.464636622311786e-06}[Rank 0] Trainer log: {'loss': 0.3973, 'grad_norm': 4.344086647033691, 'learning_rate': 8.464636622311786e-06} -[Rank 2] Trainer log: {'loss': 0.3973, 'grad_norm': 4.344086647033691, 'learning_rate': 8.464636622311786e-06}[Rank 3] Trainer log: {'loss': 0.3973, 'grad_norm': 4.344086647033691, 'learning_rate': 8.464636622311786e-06} - - -{'loss': 0.3973, 'grad_norm': 4.344086647033691, 'learning_rate': 8.464636622311786e-06, 'epoch': 0.57} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27165248394012453, 'train/info_loss': 0.22631284594535828, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011655098060145975, 'train/video_loss': 0.2261962890625, 'train/total_loss': 0.497848778963089} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0535, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0625, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000402799341827631, 'train/lm_loss': 3.154028963763267e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.2439042031764984, 'train/uncertainty_loss': 0.006248579174280167, 'train/video_loss': 0.25339651107788086, 'train/total_loss': 0.25342804193496704} -[Rank 1] Trainer log: {'loss': 0.4037, 'grad_norm': 4.016700267791748, 'learning_rate': 8.45409643577828e-06} -[Rank 3] Trainer log: {'loss': 0.4037, 'grad_norm': 4.016700267791748, 'learning_rate': 8.45409643577828e-06}[Rank 2] Trainer log: {'loss': 0.4037, 'grad_norm': 4.016700267791748, 'learning_rate': 8.45409643577828e-06} - -[Rank 0] Trainer log: {'loss': 0.4037, 'grad_norm': 4.016700267791748, 'learning_rate': 8.45409643577828e-06} -{'loss': 0.4037, 'grad_norm': 4.016700267791748, 'learning_rate': 8.45409643577828e-06, 'epoch': 0.57} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1295374870300293, 'train/info_loss': 0.12624423205852509, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010491572320461274, 'train/video_loss': 0.1261393129825592, 'train/total_loss': 0.255676805973053} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0539, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0584, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035140146501362325, 'train/lm_loss': 0.00010034904116764665, 'train/info_loss': 3.129145989078097e-05, 'train/ref_loss': 0.23063328862190247, 'train/uncertainty_loss': 0.005844449624419212, 'train/video_loss': 0.23932023346424103, 'train/total_loss': 0.2394205778837204} -[Rank 1] Trainer log: {'loss': 0.2625, 'grad_norm': 4.53999662399292, 'learning_rate': 8.443558008429735e-06} -[Rank 0] Trainer log: {'loss': 0.2625, 'grad_norm': 4.53999662399292, 'learning_rate': 8.443558008429735e-06}[Rank 2] Trainer log: {'loss': 0.2625, 'grad_norm': 4.53999662399292, 'learning_rate': 8.443558008429735e-06} - -[Rank 3] Trainer log: {'loss': 0.2625, 'grad_norm': 4.53999662399292, 'learning_rate': 8.443558008429735e-06} -{'loss': 0.2625, 'grad_norm': 4.53999662399292, 'learning_rate': 8.443558008429735e-06, 'epoch': 0.57} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09085333347320557, 'train/info_loss': 0.17837347090244293, 'train/ref_loss': None, 'train/uncertainty_loss': -8.797778864391149e-05, 'train/video_loss': 0.17828549444675446, 'train/total_loss': 0.26913881301879883} -tensor(0.0255, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.1334, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3026509523391724, 'train/info_loss': 0.24837394058704376, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011365870013833047, 'train/video_loss': 0.2482602745294571, 'train/total_loss': 0.5509112477302551} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.4878, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3836, 'grad_norm': 10.016975402832031, 'learning_rate': 8.433021352258522e-06} -[Rank 2] Trainer log: {'loss': 0.3836, 'grad_norm': 10.016975402832031, 'learning_rate': 8.433021352258522e-06}[Rank 3] Trainer log: {'loss': 0.3836, 'grad_norm': 10.016975402832031, 'learning_rate': 8.433021352258522e-06} - -[Rank 0] Trainer log: {'loss': 0.3836, 'grad_norm': 10.016975402832031, 'learning_rate': 8.433021352258522e-06} -{'loss': 0.3836, 'grad_norm': 10.016975402832031, 'learning_rate': 8.433021352258522e-06, 'epoch': 0.57} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2097, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003892923938110471, 'train/lm_loss': 3.158796753268689e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.3539840579032898, 'train/uncertainty_loss': 0.020973826944828036, 'train/video_loss': 0.3780938684940338, 'train/total_loss': 0.37812545895576477} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0334, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0493, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023125244770199062, 'train/lm_loss': 0.00011874487390741707, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.24108979105949402, 'train/uncertainty_loss': 0.004930242523550988, 'train/video_loss': 0.24789920449256897, 'train/total_loss': 0.24801795184612274} -[Rank 2] Trainer log: {'loss': 0.3719, 'grad_norm': 5.356694221496582, 'learning_rate': 8.42248647925499e-06} -[Rank 3] Trainer log: {'loss': 0.3719, 'grad_norm': 5.356694221496582, 'learning_rate': 8.42248647925499e-06} -[Rank 0] Trainer log: {'loss': 0.3719, 'grad_norm': 5.356694221496582, 'learning_rate': 8.42248647925499e-06}[Rank 1] Trainer log: {'loss': 0.3719, 'grad_norm': 5.356694221496582, 'learning_rate': 8.42248647925499e-06} - -{'loss': 0.3719, 'grad_norm': 5.356694221496582, 'learning_rate': 8.42248647925499e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.305660605430603, 'train/info_loss': 0.15529416501522064, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010797857539728284, 'train/video_loss': 0.15518619120121002, 'train/total_loss': 0.46084678173065186} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000261582643724978, 'train/lm_loss': 3.173099539708346e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.2117089033126831, 'train/uncertainty_loss': -6.97259500157088e-05, 'train/video_loss': 0.21375317871570587, 'train/total_loss': 0.21378490328788757} -[Rank 1] Trainer log: {'loss': 0.4037, 'grad_norm': 3.866602897644043, 'learning_rate': 8.411953401407467e-06}[Rank 3] Trainer log: {'loss': 0.4037, 'grad_norm': 3.866602897644043, 'learning_rate': 8.411953401407467e-06} - -[Rank 0] Trainer log: {'loss': 0.4037, 'grad_norm': 3.866602897644043, 'learning_rate': 8.411953401407467e-06}[Rank 2] Trainer log: {'loss': 0.4037, 'grad_norm': 3.866602897644043, 'learning_rate': 8.411953401407467e-06} - -{'loss': 0.4037, 'grad_norm': 3.866602897644043, 'learning_rate': 8.411953401407467e-06, 'epoch': 0.57} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08034816980361939, 'train/info_loss': 0.13731738924980164, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010927838739007712, 'train/video_loss': 0.13720810413360596, 'train/total_loss': 0.21755626797676086} -tensor(0.0087, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0308, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002730557229369879, 'train/lm_loss': 6.810459308326244e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.24081850051879883, 'train/uncertainty_loss': 0.0030788796022534373, 'train/video_loss': 0.24610835313796997, 'train/total_loss': 0.24617645144462585} -[Rank 1] Trainer log: {'loss': 0.3288, 'grad_norm': 3.8226940631866455, 'learning_rate': 8.401422130702224e-06}[Rank 2] Trainer log: {'loss': 0.3288, 'grad_norm': 3.8226940631866455, 'learning_rate': 8.401422130702224e-06}[Rank 0] Trainer log: {'loss': 0.3288, 'grad_norm': 3.8226940631866455, 'learning_rate': 8.401422130702224e-06} - - -[Rank 3] Trainer log: {'loss': 0.3288, 'grad_norm': 3.8226940631866455, 'learning_rate': 8.401422130702224e-06} -{'loss': 0.3288, 'grad_norm': 3.8226940631866455, 'learning_rate': 8.401422130702224e-06, 'epoch': 0.57} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.46920108795166016, 'train/info_loss': 0.1750355064868927, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010354258120059967, 'train/video_loss': 0.17493195831775665, 'train/total_loss': 0.6441330313682556} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.3546, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016273122746497394, 'train/lm_loss': 2.493702922947705e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.457827091217041, 'train/uncertainty_loss': 0.0354598343372345, 'train/video_loss': 0.4946076273918152, 'train/total_loss': 0.4946325719356537} -tensor(0.1960, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2241, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0606, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.399, 'grad_norm': 9.65767765045166, 'learning_rate': 8.390892679123488e-06}[Rank 1] Trainer log: {'loss': 0.399, 'grad_norm': 9.65767765045166, 'learning_rate': 8.390892679123488e-06}[Rank 3] Trainer log: {'loss': 0.399, 'grad_norm': 9.65767765045166, 'learning_rate': 8.390892679123488e-06} - - -[Rank 0] Trainer log: {'loss': 0.399, 'grad_norm': 9.65767765045166, 'learning_rate': 8.390892679123488e-06} -{'loss': 0.399, 'grad_norm': 9.65767765045166, 'learning_rate': 8.390892679123488e-06, 'epoch': 0.57} -tensor(0.2497, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021021047141402962, 'train/lm_loss': 4.1528203291818505e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.12170752882957458, 'train/uncertainty_loss': -7.002723286859691e-05, 'train/video_loss': 0.12334189563989639, 'train/total_loss': 0.12338342517614365} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1275, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0018, device='cuda:1', grad_fn=) tensor(-0.0018, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0755246639251709, 'train/info_loss': 0.15886560082435608, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011318894103169442, 'train/video_loss': 0.1587524116039276, 'train/total_loss': 0.23427706956863403} -tensor(0.2218, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4237, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3928, 'grad_norm': 12.481135368347168, 'learning_rate': 8.380365058653416e-06}[Rank 1] Trainer log: {'loss': 0.3928, 'grad_norm': 12.481135368347168, 'learning_rate': 8.380365058653416e-06} - -[Rank 3] Trainer log: {'loss': 0.3928, 'grad_norm': 12.481135368347168, 'learning_rate': 8.380365058653416e-06} -[Rank 2] Trainer log: {'loss': 0.3928, 'grad_norm': 12.481135368347168, 'learning_rate': 8.380365058653416e-06} -{'loss': 0.3928, 'grad_norm': 12.481135368347168, 'learning_rate': 8.380365058653416e-06, 'epoch': 0.57} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2723289251327515, 'train/info_loss': 0.15029111504554749, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011133514344692231, 'train/video_loss': 0.15017977356910706, 'train/total_loss': 0.42250871658325195} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06485701203346253, 'train/info_loss': 0.1585433930158615, 'train/ref_loss': None, 'train/uncertainty_loss': -9.230870055034757e-05, 'train/video_loss': 0.15845108032226562, 'train/total_loss': 0.22330808639526367} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2363, 'grad_norm': 3.1525371074676514, 'learning_rate': 8.369839281272072e-06}[Rank 3] Trainer log: {'loss': 0.2363, 'grad_norm': 3.1525371074676514, 'learning_rate': 8.369839281272072e-06}[Rank 1] Trainer log: {'loss': 0.2363, 'grad_norm': 3.1525371074676514, 'learning_rate': 8.369839281272072e-06} - - -[Rank 2] Trainer log: {'loss': 0.2363, 'grad_norm': 3.1525371074676514, 'learning_rate': 8.369839281272072e-06} -{'loss': 0.2363, 'grad_norm': 3.1525371074676514, 'learning_rate': 8.369839281272072e-06, 'epoch': 0.57} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2721230745315552, 'train/info_loss': 0.15055711567401886, 'train/ref_loss': None, 'train/uncertainty_loss': -9.816531091928483e-05, 'train/video_loss': 0.1504589468240738, 'train/total_loss': 0.4225820302963257} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0683, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1109, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2011, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020345065277069808, 'train/lm_loss': 6.050148513168097e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.34916257858276367, 'train/uncertainty_loss': 0.02011382430791855, 'train/video_loss': 0.3709293305873871, 'train/total_loss': 0.3709898293018341} -tensor(0.8761, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.432, 'grad_norm': 14.959662437438965, 'learning_rate': 8.359315358957432e-06}[Rank 3] Trainer log: {'loss': 0.432, 'grad_norm': 14.959662437438965, 'learning_rate': 8.359315358957432e-06}[Rank 0] Trainer log: {'loss': 0.432, 'grad_norm': 14.959662437438965, 'learning_rate': 8.359315358957432e-06} - - -[Rank 2] Trainer log: {'loss': 0.432, 'grad_norm': 14.959662437438965, 'learning_rate': 8.359315358957432e-06} -{'loss': 0.432, 'grad_norm': 14.959662437438965, 'learning_rate': 8.359315358957432e-06, 'epoch': 0.57} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0366, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003096492728218436, 'train/lm_loss': 4.14328562328592e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.15231460332870483, 'train/uncertainty_loss': -7.099803769961e-05, 'train/video_loss': 0.15474458038806915, 'train/total_loss': 0.15478602051734924} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.03542513251304626, 'train/info_loss': 0.19368034601211548, 'train/ref_loss': None, 'train/uncertainty_loss': -9.61030600592494e-05, 'train/video_loss': 0.19358424842357635, 'train/total_loss': 0.22900938987731934} -[Rank 3] Trainer log: {'loss': 0.3536, 'grad_norm': 1.7954152822494507, 'learning_rate': 8.34879330368535e-06} -[Rank 1] Trainer log: {'loss': 0.3536, 'grad_norm': 1.7954152822494507, 'learning_rate': 8.34879330368535e-06} -[Rank 0] Trainer log: {'loss': 0.3536, 'grad_norm': 1.7954152822494507, 'learning_rate': 8.34879330368535e-06} -[Rank 2] Trainer log: {'loss': 0.3536, 'grad_norm': 1.7954152822494507, 'learning_rate': 8.34879330368535e-06} -{'loss': 0.3536, 'grad_norm': 1.7954152822494507, 'learning_rate': 8.34879330368535e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.2200, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1851, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023725407663732767, 'train/lm_loss': 6.002478767186403e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.3368619978427887, 'train/uncertainty_loss': 0.018513400852680207, 'train/video_loss': 0.3572975695133209, 'train/total_loss': 0.35735759139060974} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.4292, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0840, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.0004586364142596722, 'train/lm_loss': 3.611715219449252e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.2693779468536377, 'train/uncertainty_loss': 0.008398322016000747, 'train/video_loss': 0.28146669268608093, 'train/total_loss': 0.2815028131008148} -[Rank 2] Trainer log: {'loss': 0.3775, 'grad_norm': 4.608431816101074, 'learning_rate': 8.33827312742957e-06}[Rank 1] Trainer log: {'loss': 0.3775, 'grad_norm': 4.608431816101074, 'learning_rate': 8.33827312742957e-06} - -[Rank 0] Trainer log: {'loss': 0.3775, 'grad_norm': 4.608431816101074, 'learning_rate': 8.33827312742957e-06} -[Rank 3] Trainer log: {'loss': 0.3775, 'grad_norm': 4.608431816101074, 'learning_rate': 8.33827312742957e-06} -{'loss': 0.3775, 'grad_norm': 4.608431816101074, 'learning_rate': 8.33827312742957e-06, 'epoch': 0.58} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2413203239440918, 'train/info_loss': 0.2647455334663391, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010508744744583965, 'train/video_loss': 0.2646404504776001, 'train/total_loss': 0.5059607625007629} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00037475090939551595, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.207945317029953, 'train/uncertainty_loss': -7.176551152952016e-05, 'train/video_loss': 0.2108973115682602, 'train/total_loss': 0.21094445884227753} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3447, 'grad_norm': 4.240108966827393, 'learning_rate': 8.327754842161685e-06}[Rank 2] Trainer log: {'loss': 0.3447, 'grad_norm': 4.240108966827393, 'learning_rate': 8.327754842161685e-06} -[Rank 1] Trainer log: {'loss': 0.3447, 'grad_norm': 4.240108966827393, 'learning_rate': 8.327754842161685e-06}[Rank 3] Trainer log: {'loss': 0.3447, 'grad_norm': 4.240108966827393, 'learning_rate': 8.327754842161685e-06} - - -{'loss': 0.3447, 'grad_norm': 4.240108966827393, 'learning_rate': 8.327754842161685e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0615, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019584074616432192, 'train/lm_loss': 2.4674800806678834e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.06670720875263214, 'train/uncertainty_loss': -6.748899468220771e-05, 'train/video_loss': 0.06822468340396881, 'train/total_loss': 0.06824935972690582} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2141350030899048, 'train/info_loss': 0.26848793029785156, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012422180734574794, 'train/video_loss': 0.26836371421813965, 'train/total_loss': 0.4824987053871155} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3063, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0551, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2932, 'grad_norm': 9.70335865020752, 'learning_rate': 8.31723845985114e-06}[Rank 2] Trainer log: {'loss': 0.2932, 'grad_norm': 9.70335865020752, 'learning_rate': 8.31723845985114e-06}[Rank 0] Trainer log: {'loss': 0.2932, 'grad_norm': 9.70335865020752, 'learning_rate': 8.31723845985114e-06} - -[Rank 1] Trainer log: {'loss': 0.2932, 'grad_norm': 9.70335865020752, 'learning_rate': 8.31723845985114e-06} - -{'loss': 0.2932, 'grad_norm': 9.70335865020752, 'learning_rate': 8.31723845985114e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2273, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005227552261203528, 'train/lm_loss': 4.126599815208465e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.3636597990989685, 'train/uncertainty_loss': 0.022729510068893434, 'train/video_loss': 0.3905937075614929, 'train/total_loss': 0.3906349837779999} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2846405029296875, 'train/info_loss': 0.2386389970779419, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011690501123666764, 'train/video_loss': 0.23852209746837616, 'train/total_loss': 0.5231626033782959} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4146, 'grad_norm': 2.812859058380127, 'learning_rate': 8.306723992465224e-06} -[Rank 2] Trainer log: {'loss': 0.4146, 'grad_norm': 2.812859058380127, 'learning_rate': 8.306723992465224e-06} -[Rank 1] Trainer log: {'loss': 0.4146, 'grad_norm': 2.812859058380127, 'learning_rate': 8.306723992465224e-06} -[Rank 0] Trainer log: {'loss': 0.4146, 'grad_norm': 2.812859058380127, 'learning_rate': 8.306723992465224e-06} -{'loss': 0.4146, 'grad_norm': 2.812859058380127, 'learning_rate': 8.306723992465224e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4502320766448975, 'train/info_loss': 0.1863643229007721, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011045219143852593, 'train/video_loss': 0.1862538754940033, 'train/total_loss': 0.6364859342575073} -tensor(0.0361, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2996868371963501, 'train/info_loss': 0.1897967904806137, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012820740230381488, 'train/video_loss': 0.18966858088970184, 'train/total_loss': 0.4893554449081421} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.31, 'grad_norm': 1.532781958580017, 'learning_rate': 8.296211451969026e-06}[Rank 0] Trainer log: {'loss': 0.31, 'grad_norm': 1.532781958580017, 'learning_rate': 8.296211451969026e-06} -[Rank 1] Trainer log: {'loss': 0.31, 'grad_norm': 1.532781958580017, 'learning_rate': 8.296211451969026e-06} -[Rank 3] Trainer log: {'loss': 0.31, 'grad_norm': 1.532781958580017, 'learning_rate': 8.296211451969026e-06} - -{'loss': 0.31, 'grad_norm': 1.532781958580017, 'learning_rate': 8.296211451969026e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24970092773437502, 'train/info_loss': 0.18759432435035706, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000107451097574085, 'train/video_loss': 0.18748687207698822, 'train/total_loss': 0.4371877908706665} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1693, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.7438, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015879167476668955, 'train/lm_loss': 4.755885165650398e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.6931647062301636, 'train/uncertainty_loss': 0.07437763810157776, 'train/video_loss': 0.7688368558883667, 'train/total_loss': 0.7688844203948975} -[Rank 1] Trainer log: {'loss': 0.4158, 'grad_norm': 7.178661346435547, 'learning_rate': 8.285700850325467e-06} -[Rank 0] Trainer log: {'loss': 0.4158, 'grad_norm': 7.178661346435547, 'learning_rate': 8.285700850325467e-06}[Rank 3] Trainer log: {'loss': 0.4158, 'grad_norm': 7.178661346435547, 'learning_rate': 8.285700850325467e-06} - -[Rank 2] Trainer log: {'loss': 0.4158, 'grad_norm': 7.178661346435547, 'learning_rate': 8.285700850325467e-06} -{'loss': 0.4158, 'grad_norm': 7.178661346435547, 'learning_rate': 8.285700850325467e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021669212728738787, 'train/lm_loss': 3.1659481464885175e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.09957362711429596, 'train/uncertainty_loss': -6.593595026060939e-05, 'train/video_loss': 0.10126190632581711, 'train/total_loss': 0.10129356384277344} -tensor(0.2293, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(0.0979, device='cuda:2', grad_fn=) {'train/tv_loss': 0.00016631362959742548, 'train/lm_loss': 4.109914007131011e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.12169698625802994, 'train/uncertainty_loss': -6.928115617483854e-05, 'train/video_loss': 0.12298235297203064, 'train/total_loss': 0.12302345037460327} -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0663, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3451, 'grad_norm': 8.245985984802246, 'learning_rate': 8.275192199495237e-06}[Rank 3] Trainer log: {'loss': 0.3451, 'grad_norm': 8.245985984802246, 'learning_rate': 8.275192199495237e-06}[Rank 1] Trainer log: {'loss': 0.3451, 'grad_norm': 8.245985984802246, 'learning_rate': 8.275192199495237e-06} - - -[Rank 0] Trainer log: {'loss': 0.3451, 'grad_norm': 8.245985984802246, 'learning_rate': 8.275192199495237e-06} -{'loss': 0.3451, 'grad_norm': 8.245985984802246, 'learning_rate': 8.275192199495237e-06, 'epoch': 0.58} -tensor(0.2338, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0454, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00036622839979827404, 'train/lm_loss': 5.292192217893899e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.24662943184375763, 'train/uncertainty_loss': 0.004542823508381844, 'train/video_loss': 0.2541262209415436, 'train/total_loss': 0.2541791498661041} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1667, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.0862, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022041494958102703, 'train/lm_loss': 3.621250216383487e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.2741661071777344, 'train/uncertainty_loss': 0.008619014173746109, 'train/video_loss': 0.2845694124698639, 'train/total_loss': 0.28460562229156494} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.375, 'grad_norm': 2.365518569946289, 'learning_rate': 8.264685511436831e-06}[Rank 1] Trainer log: {'loss': 0.375, 'grad_norm': 2.365518569946289, 'learning_rate': 8.264685511436831e-06}[Rank 3] Trainer log: {'loss': 0.375, 'grad_norm': 2.365518569946289, 'learning_rate': 8.264685511436831e-06} - - -[Rank 2] Trainer log: {'loss': 0.375, 'grad_norm': 2.365518569946289, 'learning_rate': 8.264685511436831e-06} -{'loss': 0.375, 'grad_norm': 2.365518569946289, 'learning_rate': 8.264685511436831e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1075, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0049, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00034519624896347525, 'train/lm_loss': 4.7225144226104024e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.21472159028053284, 'train/uncertainty_loss': 0.00048650046810507777, 'train/video_loss': 0.21799129247665405, 'train/total_loss': 0.21803851425647736} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23285667896270754, 'train/info_loss': 0.257907509803772, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001257142284885049, 'train/video_loss': 0.2577818036079407, 'train/total_loss': 0.49063849449157715} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3259, 'grad_norm': 2.7278833389282227, 'learning_rate': 8.254180798106484e-06}[Rank 2] Trainer log: {'loss': 0.3259, 'grad_norm': 2.7278833389282227, 'learning_rate': 8.254180798106484e-06} - -[Rank 0] Trainer log: {'loss': 0.3259, 'grad_norm': 2.7278833389282227, 'learning_rate': 8.254180798106484e-06}[Rank 3] Trainer log: {'loss': 0.3259, 'grad_norm': 2.7278833389282227, 'learning_rate': 8.254180798106484e-06} - -{'loss': 0.3259, 'grad_norm': 2.7278833389282227, 'learning_rate': 8.254180798106484e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10044503211975098, 'train/info_loss': 0.17890861630439758, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010478626936674119, 'train/video_loss': 0.17880383133888245, 'train/total_loss': 0.2792488634586334} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24563562870025635, 'train/info_loss': 0.17303606867790222, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000138058268930763, 'train/video_loss': 0.17289800941944122, 'train/total_loss': 0.4185336232185364} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4482, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3489, 'grad_norm': 7.939164161682129, 'learning_rate': 8.24367807145821e-06} -[Rank 3] Trainer log: {'loss': 0.3489, 'grad_norm': 7.939164161682129, 'learning_rate': 8.24367807145821e-06}[Rank 2] Trainer log: {'loss': 0.3489, 'grad_norm': 7.939164161682129, 'learning_rate': 8.24367807145821e-06} - -[Rank 0] Trainer log: {'loss': 0.3489, 'grad_norm': 7.939164161682129, 'learning_rate': 8.24367807145821e-06} -{'loss': 0.3489, 'grad_norm': 7.939164161682129, 'learning_rate': 8.24367807145821e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1481959342956543, 'train/info_loss': 0.153592050075531, 'train/ref_loss': None, 'train/uncertainty_loss': -8.212334359996021e-05, 'train/video_loss': 0.1535099297761917, 'train/total_loss': 0.30170586705207825} -tensor(0.4771, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0547, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024304280523210764, 'train/lm_loss': 4.140902019571513e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.2543547749519348, 'train/uncertainty_loss': 0.005470168590545655, 'train/video_loss': 0.26179125905036926, 'train/total_loss': 0.26183265447616577} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2987, 'grad_norm': 4.392376899719238, 'learning_rate': 8.233177343443745e-06} -[Rank 0] Trainer log: {'loss': 0.2987, 'grad_norm': 4.392376899719238, 'learning_rate': 8.233177343443745e-06}[Rank 1] Trainer log: {'loss': 0.2987, 'grad_norm': 4.392376899719238, 'learning_rate': 8.233177343443745e-06}[Rank 3] Trainer log: {'loss': 0.2987, 'grad_norm': 4.392376899719238, 'learning_rate': 8.233177343443745e-06} - - -{'loss': 0.2987, 'grad_norm': 4.392376899719238, 'learning_rate': 8.233177343443745e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.0544, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0529, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018890849314630033, 'train/lm_loss': 4.1480531217530375e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.24649251997470856, 'train/uncertainty_loss': 0.005294829234480858, 'train/video_loss': 0.25332096219062805, 'train/total_loss': 0.2533624470233917} -tensor(0.6070, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1495535135269165, 'train/info_loss': 0.13052970170974731, 'train/ref_loss': None, 'train/uncertainty_loss': -9.282481623813511e-05, 'train/video_loss': 0.13043688237667084, 'train/total_loss': 0.27999040484428406} -tensor(0.2241, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4423, 'grad_norm': 3.2419235706329346, 'learning_rate': 8.222678626012554e-06}[Rank 0] Trainer log: {'loss': 0.4423, 'grad_norm': 3.2419235706329346, 'learning_rate': 8.222678626012554e-06}[Rank 2] Trainer log: {'loss': 0.4423, 'grad_norm': 3.2419235706329346, 'learning_rate': 8.222678626012554e-06} - - -[Rank 3] Trainer log: {'loss': 0.4423, 'grad_norm': 3.2419235706329346, 'learning_rate': 8.222678626012554e-06} -{'loss': 0.4423, 'grad_norm': 3.2419235706329346, 'learning_rate': 8.222678626012554e-06, 'epoch': 0.58} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1428, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5468, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016689333133399487, 'train/lm_loss': 6.085900240577758e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.5690100789070129, 'train/uncertainty_loss': 0.05467517375946045, 'train/video_loss': 0.6250461339950562, 'train/total_loss': 0.6251069903373718} -tensor(0.0290, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.059867972135543825, 'train/info_loss': 0.24461250007152557, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010001306654885412, 'train/video_loss': 0.24451248347759247, 'train/total_loss': 0.3043804466724396} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1309, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3835, 'grad_norm': 6.824668884277344, 'learning_rate': 8.21218193111182e-06} -[Rank 0] Trainer log: {'loss': 0.3835, 'grad_norm': 6.824668884277344, 'learning_rate': 8.21218193111182e-06}[Rank 3] Trainer log: {'loss': 0.3835, 'grad_norm': 6.824668884277344, 'learning_rate': 8.21218193111182e-06} - -[Rank 2] Trainer log: {'loss': 0.3835, 'grad_norm': 6.824668884277344, 'learning_rate': 8.21218193111182e-06} -{'loss': 0.3835, 'grad_norm': 6.824668884277344, 'learning_rate': 8.21218193111182e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.1772, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003680656896904111, 'train/lm_loss': 4.729665233753622e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.3308458924293518, 'train/uncertainty_loss': 0.017724953591823578, 'train/video_loss': 0.35153916478157043, 'train/total_loss': 0.3515864610671997} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0175, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001868139603175223, 'train/lm_loss': 6.178854964673519e-05, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.19760672748088837, 'train/uncertainty_loss': -6.76418305374682e-05, 'train/video_loss': 0.19906185567378998, 'train/total_loss': 0.19912365078926086} -[Rank 1] Trainer log: {'loss': 0.4014, 'grad_norm': 7.401335716247559, 'learning_rate': 8.201687270686415e-06}[Rank 2] Trainer log: {'loss': 0.4014, 'grad_norm': 7.401335716247559, 'learning_rate': 8.201687270686415e-06} - -[Rank 3] Trainer log: {'loss': 0.4014, 'grad_norm': 7.401335716247559, 'learning_rate': 8.201687270686415e-06} -[Rank 0] Trainer log: {'loss': 0.4014, 'grad_norm': 7.401335716247559, 'learning_rate': 8.201687270686415e-06} -{'loss': 0.4014, 'grad_norm': 7.401335716247559, 'learning_rate': 8.201687270686415e-06, 'epoch': 0.58} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0198, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002282246481627226, 'train/lm_loss': 7.89248151704669e-05, 'train/info_loss': 3.1529863917967305e-05, 'train/ref_loss': 0.19166076183319092, 'train/uncertainty_loss': -7.098731584846974e-05, 'train/video_loss': 0.19344709813594818, 'train/total_loss': 0.193526029586792} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33268766403198247, 'train/info_loss': 0.17668801546096802, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010789779480546713, 'train/video_loss': 0.17658011615276337, 'train/total_loss': 0.5092678070068359} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3317, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2648, 'grad_norm': 4.745727062225342, 'learning_rate': 8.191194656678905e-06} -[Rank 2] Trainer log: {'loss': 0.2648, 'grad_norm': 4.745727062225342, 'learning_rate': 8.191194656678905e-06} -[Rank 0] Trainer log: {'loss': 0.2648, 'grad_norm': 4.745727062225342, 'learning_rate': 8.191194656678905e-06}[Rank 3] Trainer log: {'loss': 0.2648, 'grad_norm': 4.745727062225342, 'learning_rate': 8.191194656678905e-06} - -{'loss': 0.2648, 'grad_norm': 4.745727062225342, 'learning_rate': 8.191194656678905e-06, 'epoch': 0.58} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.3990, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017805756069719792, 'train/lm_loss': 6.035847472958267e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.47507917881011963, 'train/uncertainty_loss': 0.03990022838115692, 'train/video_loss': 0.516429603099823, 'train/total_loss': 0.5164899826049805} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19384398460388186, 'train/info_loss': 0.15035618841648102, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010069848503917456, 'train/video_loss': 0.150255486369133, 'train/total_loss': 0.3440994620323181} -tensor(0.0015, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2025, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2906, 'grad_norm': 8.614053726196289, 'learning_rate': 8.180704101029518e-06} -[Rank 0] Trainer log: {'loss': 0.2906, 'grad_norm': 8.614053726196289, 'learning_rate': 8.180704101029518e-06}[Rank 1] Trainer log: {'loss': 0.2906, 'grad_norm': 8.614053726196289, 'learning_rate': 8.180704101029518e-06} - -[Rank 2] Trainer log: {'loss': 0.2906, 'grad_norm': 8.614053726196289, 'learning_rate': 8.180704101029518e-06} -{'loss': 0.2906, 'grad_norm': 8.614053726196289, 'learning_rate': 8.180704101029518e-06, 'epoch': 0.58} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12604349851608276, 'train/info_loss': 0.18397852778434753, 'train/ref_loss': None, 'train/uncertainty_loss': -8.032940095290542e-05, 'train/video_loss': 0.1838981956243515, 'train/total_loss': 0.30994170904159546} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1389, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.1478, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17310125827789308, 'train/info_loss': 0.16305382549762726, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010879377368837595, 'train/video_loss': 0.16294503211975098, 'train/total_loss': 0.3360462784767151} -[Rank 1] Trainer log: {'loss': 0.3616, 'grad_norm': 7.141207218170166, 'learning_rate': 8.170215615676145e-06} -[Rank 2] Trainer log: {'loss': 0.3616, 'grad_norm': 7.141207218170166, 'learning_rate': 8.170215615676145e-06} -[Rank 3] Trainer log: {'loss': 0.3616, 'grad_norm': 7.141207218170166, 'learning_rate': 8.170215615676145e-06}[Rank 0] Trainer log: {'loss': 0.3616, 'grad_norm': 7.141207218170166, 'learning_rate': 8.170215615676145e-06} - -{'loss': 0.3616, 'grad_norm': 7.141207218170166, 'learning_rate': 8.170215615676145e-06, 'epoch': 0.58} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0017, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0308, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2860, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016919680638238788, 'train/lm_loss': 5.349397542886436e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.39339587092399597, 'train/uncertainty_loss': 0.028602352738380434, 'train/video_loss': 0.423377126455307, 'train/total_loss': 0.4234306216239929} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2731693744659424, 'train/info_loss': 0.1290113776922226, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011304112849757076, 'train/video_loss': 0.12889833748340607, 'train/total_loss': 0.40206772089004517} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.4250, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3311, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.42, 'grad_norm': 13.771077156066895, 'learning_rate': 8.159729212554328e-06}[Rank 2] Trainer log: {'loss': 0.42, 'grad_norm': 13.771077156066895, 'learning_rate': 8.159729212554328e-06} -[Rank 3] Trainer log: {'loss': 0.42, 'grad_norm': 13.771077156066895, 'learning_rate': 8.159729212554328e-06} - -[Rank 1] Trainer log: {'loss': 0.42, 'grad_norm': 13.771077156066895, 'learning_rate': 8.159729212554328e-06} -{'loss': 0.42, 'grad_norm': 13.771077156066895, 'learning_rate': 8.159729212554328e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(0.1225, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1383, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001771475304849446, 'train/lm_loss': 2.794070460367948e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.30517417192459106, 'train/uncertainty_loss': 0.013829855620861054, 'train/video_loss': 0.32044124603271484, 'train/total_loss': 0.3204692006111145} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18328508138656618, 'train/info_loss': 0.15023578703403473, 'train/ref_loss': None, 'train/uncertainty_loss': -8.88864218723029e-05, 'train/video_loss': 0.15014690160751343, 'train/total_loss': 0.33343198895454407} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3498, 'grad_norm': 6.697657585144043, 'learning_rate': 8.149244903597225e-06}[Rank 2] Trainer log: {'loss': 0.3498, 'grad_norm': 6.697657585144043, 'learning_rate': 8.149244903597225e-06} - -[Rank 0] Trainer log: {'loss': 0.3498, 'grad_norm': 6.697657585144043, 'learning_rate': 8.149244903597225e-06} -[Rank 3] Trainer log: {'loss': 0.3498, 'grad_norm': 6.697657585144043, 'learning_rate': 8.149244903597225e-06} -{'loss': 0.3498, 'grad_norm': 6.697657585144043, 'learning_rate': 8.149244903597225e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12958513498306276, 'train/info_loss': 0.14840617775917053, 'train/ref_loss': None, 'train/uncertainty_loss': -9.529576054774226e-05, 'train/video_loss': 0.14831088483333588, 'train/total_loss': 0.2778960168361664} -tensor(0.2733, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0198, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37890880107879643, 'train/info_loss': 0.1910867542028427, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013872941490262748, 'train/video_loss': 0.190948024392128, 'train/total_loss': 0.5698568224906921} -tensor(0.0339, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4490, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3698, 'grad_norm': 8.829907417297363, 'learning_rate': 8.138762700735622e-06}[Rank 1] Trainer log: {'loss': 0.3698, 'grad_norm': 8.829907417297363, 'learning_rate': 8.138762700735622e-06} -[Rank 3] Trainer log: {'loss': 0.3698, 'grad_norm': 8.829907417297363, 'learning_rate': 8.138762700735622e-06} - -[Rank 2] Trainer log: {'loss': 0.3698, 'grad_norm': 8.829907417297363, 'learning_rate': 8.138762700735622e-06} -{'loss': 0.3698, 'grad_norm': 8.829907417297363, 'learning_rate': 8.138762700735622e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08641330003738404, 'train/info_loss': 0.10125043243169785, 'train/ref_loss': None, 'train/uncertainty_loss': -9.630617569200695e-05, 'train/video_loss': 0.10115412622690201, 'train/total_loss': 0.18756742775440216} -tensor(0.0381, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19862734079360964, 'train/info_loss': 0.25544077157974243, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001196382800117135, 'train/video_loss': 0.2553211450576782, 'train/total_loss': 0.4539484977722168} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3746, 'grad_norm': 5.729103088378906, 'learning_rate': 8.128282615897904e-06} -[Rank 3] Trainer log: {'loss': 0.3746, 'grad_norm': 5.729103088378906, 'learning_rate': 8.128282615897904e-06}[Rank 0] Trainer log: {'loss': 0.3746, 'grad_norm': 5.729103088378906, 'learning_rate': 8.128282615897904e-06} -[Rank 2] Trainer log: {'loss': 0.3746, 'grad_norm': 5.729103088378906, 'learning_rate': 8.128282615897904e-06} - -{'loss': 0.3746, 'grad_norm': 5.729103088378906, 'learning_rate': 8.128282615897904e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4290374755859375, 'train/info_loss': 0.21294137835502625, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012240289943292739, 'train/video_loss': 0.21281898021697998, 'train/total_loss': 0.6418564319610596} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.5356, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0627, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3036221981048584, 'train/info_loss': 0.25799354910850525, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011627946514636278, 'train/video_loss': 0.25787726044654846, 'train/total_loss': 0.5614994764328003} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1814, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.476, 'grad_norm': 3.3108267784118652, 'learning_rate': 8.117804661010046e-06}[Rank 0] Trainer log: {'loss': 0.476, 'grad_norm': 3.3108267784118652, 'learning_rate': 8.117804661010046e-06}[Rank 1] Trainer log: {'loss': 0.476, 'grad_norm': 3.3108267784118652, 'learning_rate': 8.117804661010046e-06} - - -[Rank 2] Trainer log: {'loss': 0.476, 'grad_norm': 3.3108267784118652, 'learning_rate': 8.117804661010046e-06} -{'loss': 0.476, 'grad_norm': 3.3108267784118652, 'learning_rate': 8.117804661010046e-06, 'epoch': 0.58} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2370182752609253, 'train/info_loss': 0.2098541408777237, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011830831645056606, 'train/video_loss': 0.20973582565784454, 'train/total_loss': 0.4467540979385376} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14444385766983034, 'train/info_loss': 0.29443076252937317, 'train/ref_loss': None, 'train/uncertainty_loss': -9.369755862280727e-05, 'train/video_loss': 0.29433706402778625, 'train/total_loss': 0.43878090381622314} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3012, 'grad_norm': 3.895914316177368, 'learning_rate': 8.107328847995596e-06} -[Rank 3] Trainer log: {'loss': 0.3012, 'grad_norm': 3.895914316177368, 'learning_rate': 8.107328847995596e-06} -[Rank 1] Trainer log: {'loss': 0.3012, 'grad_norm': 3.895914316177368, 'learning_rate': 8.107328847995596e-06}[Rank 0] Trainer log: {'loss': 0.3012, 'grad_norm': 3.895914316177368, 'learning_rate': 8.107328847995596e-06} - -{'loss': 0.3012, 'grad_norm': 3.895914316177368, 'learning_rate': 8.107328847995596e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2477, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022544432431459427, 'train/lm_loss': 4.653389332816005e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.3829549252986908, 'train/uncertainty_loss': 0.02476864904165268, 'train/video_loss': 0.4095498323440552, 'train/total_loss': 0.40959635376930237} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0647, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5443, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00036395939532667403, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.6474859714508057, 'train/uncertainty_loss': 0.05442849397659302, 'train/video_loss': 0.7048496007919312, 'train/total_loss': 0.7048856019973755} -tensor(0.3257, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4725, 'grad_norm': 5.873077392578125, 'learning_rate': 8.096855188775672e-06}[Rank 2] Trainer log: {'loss': 0.4725, 'grad_norm': 5.873077392578125, 'learning_rate': 8.096855188775672e-06} - -[Rank 0] Trainer log: {'loss': 0.4725, 'grad_norm': 5.873077392578125, 'learning_rate': 8.096855188775672e-06}[Rank 3] Trainer log: {'loss': 0.4725, 'grad_norm': 5.873077392578125, 'learning_rate': 8.096855188775672e-06} - -{'loss': 0.4725, 'grad_norm': 5.873077392578125, 'learning_rate': 8.096855188775672e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016185141867026688, 'train/lm_loss': 4.720130818895996e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.12982803583145142, 'train/uncertainty_loss': -6.867795600555837e-05, 'train/video_loss': 0.13107872009277344, 'train/total_loss': 0.13112592697143555} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.4041, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0010206847451627256, 'train/lm_loss': 4.1575878276489676e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.513627290725708, 'train/uncertainty_loss': 0.040414467453956604, 'train/video_loss': 0.5622310042381287, 'train/total_loss': 0.5622726082801819} -[Rank 0] Trainer log: {'loss': 0.3726, 'grad_norm': 9.826825141906738, 'learning_rate': 8.086383695268937e-06}[Rank 1] Trainer log: {'loss': 0.3726, 'grad_norm': 9.826825141906738, 'learning_rate': 8.086383695268937e-06} -[Rank 3] Trainer log: {'loss': 0.3726, 'grad_norm': 9.826825141906738, 'learning_rate': 8.086383695268937e-06} - -[Rank 2] Trainer log: {'loss': 0.3726, 'grad_norm': 9.826825141906738, 'learning_rate': 8.086383695268937e-06} -{'loss': 0.3726, 'grad_norm': 9.826825141906738, 'learning_rate': 8.086383695268937e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40064373016357424, 'train/info_loss': 0.25235918164253235, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010734613751992584, 'train/video_loss': 0.25225183367729187, 'train/total_loss': 0.6528955698013306} -tensor(0.3931, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1785, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1797, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002411607885733247, 'train/lm_loss': 6.131185800768435e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.33073684573173523, 'train/uncertainty_loss': 0.017974931001663207, 'train/video_loss': 0.3506675958633423, 'train/total_loss': 0.3507288992404938} -[Rank 1] Trainer log: {'loss': 0.3841, 'grad_norm': 14.857680320739746, 'learning_rate': 8.075914379391587e-06}[Rank 0] Trainer log: {'loss': 0.3841, 'grad_norm': 14.857680320739746, 'learning_rate': 8.075914379391587e-06}[Rank 2] Trainer log: {'loss': 0.3841, 'grad_norm': 14.857680320739746, 'learning_rate': 8.075914379391587e-06} - -[Rank 3] Trainer log: {'loss': 0.3841, 'grad_norm': 14.857680320739746, 'learning_rate': 8.075914379391587e-06} - -{'loss': 0.3841, 'grad_norm': 14.857680320739746, 'learning_rate': 8.075914379391587e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.7131, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0220, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015527155483141541, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.23008447885513306, 'train/uncertainty_loss': 0.002203440107405186, 'train/video_loss': 0.23354892432689667, 'train/total_loss': 0.23357702791690826} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38809633255004883, 'train/info_loss': 0.22399897873401642, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011103797005489469, 'train/video_loss': 0.22388793528079987, 'train/total_loss': 0.6119842529296875} -[Rank 1] Trainer log: {'loss': 0.4196, 'grad_norm': 3.87770676612854, 'learning_rate': 8.065447253057348e-06} -[Rank 2] Trainer log: {'loss': 0.4196, 'grad_norm': 3.87770676612854, 'learning_rate': 8.065447253057348e-06}[Rank 3] Trainer log: {'loss': 0.4196, 'grad_norm': 3.87770676612854, 'learning_rate': 8.065447253057348e-06}[Rank 0] Trainer log: {'loss': 0.4196, 'grad_norm': 3.87770676612854, 'learning_rate': 8.065447253057348e-06} - - -{'loss': 0.4196, 'grad_norm': 3.87770676612854, 'learning_rate': 8.065447253057348e-06, 'epoch': 0.58} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021433569490909578, 'train/lm_loss': 4.658156540244818e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.20952406525611877, 'train/uncertainty_loss': -6.964405765756965e-05, 'train/video_loss': 0.2111911028623581, 'train/total_loss': 0.21123768389225006} -tensor(0.3832, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07780472040176392, 'train/info_loss': 0.15684643387794495, 'train/ref_loss': None, 'train/uncertainty_loss': -8.716833544895053e-05, 'train/video_loss': 0.15675926208496094, 'train/total_loss': 0.23456397652626038} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4397, 'grad_norm': 6.328469276428223, 'learning_rate': 8.054982328177441e-06} -[Rank 3] Trainer log: {'loss': 0.4397, 'grad_norm': 6.328469276428223, 'learning_rate': 8.054982328177441e-06} -[Rank 0] Trainer log: {'loss': 0.4397, 'grad_norm': 6.328469276428223, 'learning_rate': 8.054982328177441e-06} -[Rank 2] Trainer log: {'loss': 0.4397, 'grad_norm': 6.328469276428223, 'learning_rate': 8.054982328177441e-06} -{'loss': 0.4397, 'grad_norm': 6.328469276428223, 'learning_rate': 8.054982328177441e-06, 'epoch': 0.58} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38809421062469485, 'train/info_loss': 0.22527265548706055, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013503632508218288, 'train/video_loss': 0.2251376211643219, 'train/total_loss': 0.6132318377494812} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37056841850280764, 'train/info_loss': 0.39117470383644104, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001524962834082544, 'train/video_loss': 0.3910222053527832, 'train/total_loss': 0.7615906000137329} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0771, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1929, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4516, 'grad_norm': 8.091751098632812, 'learning_rate': 8.044519616660598e-06}[Rank 2] Trainer log: {'loss': 0.4516, 'grad_norm': 8.091751098632812, 'learning_rate': 8.044519616660598e-06}[Rank 0] Trainer log: {'loss': 0.4516, 'grad_norm': 8.091751098632812, 'learning_rate': 8.044519616660598e-06} - - -{'loss': 0.4516, 'grad_norm': 8.091751098632812, 'learning_rate': 8.044519616660598e-06, 'epoch': 0.58} -[Rank 1] Trainer log: {'loss': 0.4516, 'grad_norm': 8.091751098632812, 'learning_rate': 8.044519616660598e-06} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(1.0694, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0947, device='cuda:0', grad_fn=)tensor(0.1948, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002680151723325253, 'train/lm_loss': 4.7272816300392154e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.26263347268104553, 'train/uncertainty_loss': 0.009472687542438508, 'train/video_loss': 0.27427372336387634, 'train/total_loss': 0.2743209898471832} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08816068768501283, 'train/info_loss': 0.21034598350524902, 'train/ref_loss': None, 'train/uncertainty_loss': -9.151448612101377e-05, 'train/video_loss': 0.2102544754743576, 'train/total_loss': 0.2984151542186737} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0340, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4017, 'grad_norm': 13.190881729125977, 'learning_rate': 8.034059130413015e-06}[Rank 2] Trainer log: {'loss': 0.4017, 'grad_norm': 13.190881729125977, 'learning_rate': 8.034059130413015e-06} - -[Rank 3] Trainer log: {'loss': 0.4017, 'grad_norm': 13.190881729125977, 'learning_rate': 8.034059130413015e-06} -[Rank 0] Trainer log: {'loss': 0.4017, 'grad_norm': 13.190881729125977, 'learning_rate': 8.034059130413015e-06} -{'loss': 0.4017, 'grad_norm': 13.190881729125977, 'learning_rate': 8.034059130413015e-06, 'epoch': 0.58} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12682393789291382, 'train/info_loss': 0.2183239758014679, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010143162216991186, 'train/video_loss': 0.21822254359722137, 'train/total_loss': 0.3450464904308319} -tensor(0.2351, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1177, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1621, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017249174416065217, 'train/lm_loss': 3.666541597340256e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.1917893886566162, 'train/uncertainty_loss': -6.85110513586551e-05, 'train/video_loss': 0.19312280416488647, 'train/total_loss': 0.19315947592258453} -[Rank 1] Trainer log: {'loss': 0.3373, 'grad_norm': 16.21238899230957, 'learning_rate': 8.023600881338372e-06} -[Rank 2] Trainer log: {'loss': 0.3373, 'grad_norm': 16.21238899230957, 'learning_rate': 8.023600881338372e-06}[Rank 3] Trainer log: {'loss': 0.3373, 'grad_norm': 16.21238899230957, 'learning_rate': 8.023600881338372e-06} - -[Rank 0] Trainer log: {'loss': 0.3373, 'grad_norm': 16.21238899230957, 'learning_rate': 8.023600881338372e-06} -{'loss': 0.3373, 'grad_norm': 16.21238899230957, 'learning_rate': 8.023600881338372e-06, 'epoch': 0.59} -tensor(0.1370, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2060, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002842919435352087, 'train/lm_loss': 6.114501738920809e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.35271063446998596, 'train/uncertainty_loss': 0.020597222447395327, 'train/video_loss': 0.3756087124347687, 'train/total_loss': 0.3756698668003082} -tensor(0.0783, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0006, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019877639133483174, 'train/lm_loss': 7.77331879362464e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.15795962512493134, 'train/uncertainty_loss': -6.440860452130438e-05, 'train/video_loss': 0.15951195359230042, 'train/total_loss': 0.15958969295024872} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2585, 'grad_norm': 6.719442844390869, 'learning_rate': 8.013144881337797e-06}[Rank 3] Trainer log: {'loss': 0.2585, 'grad_norm': 6.719442844390869, 'learning_rate': 8.013144881337797e-06} - -[Rank 0] Trainer log: {'loss': 0.2585, 'grad_norm': 6.719442844390869, 'learning_rate': 8.013144881337797e-06}[Rank 1] Trainer log: {'loss': 0.2585, 'grad_norm': 6.719442844390869, 'learning_rate': 8.013144881337797e-06} - -{'loss': 0.2585, 'grad_norm': 6.719442844390869, 'learning_rate': 8.013144881337797e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1334, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018367874436080457, 'train/lm_loss': 4.1671225335448984e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.29850447177886963, 'train/uncertainty_loss': 0.013335129618644715, 'train/video_loss': 0.31333282589912415, 'train/total_loss': 0.31337448954582214} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22688462734222414, 'train/info_loss': 0.19481955468654633, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012093845289200545, 'train/video_loss': 0.19469861686229706, 'train/total_loss': 0.42158323526382446} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0223, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.304, 'grad_norm': 2.4272260665893555, 'learning_rate': 8.00269114230985e-06}[Rank 3] Trainer log: {'loss': 0.304, 'grad_norm': 2.4272260665893555, 'learning_rate': 8.00269114230985e-06}[Rank 1] Trainer log: {'loss': 0.304, 'grad_norm': 2.4272260665893555, 'learning_rate': 8.00269114230985e-06} - - -[Rank 2] Trainer log: {'loss': 0.304, 'grad_norm': 2.4272260665893555, 'learning_rate': 8.00269114230985e-06} -{'loss': 0.304, 'grad_norm': 2.4272260665893555, 'learning_rate': 8.00269114230985e-06, 'epoch': 0.59} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22975971698760989, 'train/info_loss': 0.21056129038333893, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011950545012950897, 'train/video_loss': 0.21044178307056427, 'train/total_loss': 0.4402015209197998} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06438305377960206, 'train/info_loss': 0.34662535786628723, 'train/ref_loss': None, 'train/uncertainty_loss': -8.812848245725036e-05, 'train/video_loss': 0.3465372323989868, 'train/total_loss': 0.41092029213905334} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3366, 'grad_norm': 3.1514477729797363, 'learning_rate': 7.992239676150535e-06}[Rank 1] Trainer log: {'loss': 0.3366, 'grad_norm': 3.1514477729797363, 'learning_rate': 7.992239676150535e-06} -[Rank 2] Trainer log: {'loss': 0.3366, 'grad_norm': 3.1514477729797363, 'learning_rate': 7.992239676150535e-06} - -[Rank 3] Trainer log: {'loss': 0.3366, 'grad_norm': 3.1514477729797363, 'learning_rate': 7.992239676150535e-06} -{'loss': 0.3366, 'grad_norm': 3.1514477729797363, 'learning_rate': 7.992239676150535e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002622131956741214, 'train/lm_loss': 4.696294490713626e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.07205285131931305, 'train/uncertainty_loss': -6.795987137593329e-05, 'train/video_loss': 0.07410602271556854, 'train/total_loss': 0.07415298372507095} -tensor(0.1182, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28977129459381107, 'train/info_loss': 0.10295277088880539, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011230695527046919, 'train/video_loss': 0.10284046083688736, 'train/total_loss': 0.3926117420196533} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(0.1899, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3063, 'grad_norm': 10.49147891998291, 'learning_rate': 7.98179049475325e-06}[Rank 2] Trainer log: {'loss': 0.3063, 'grad_norm': 10.49147891998291, 'learning_rate': 7.98179049475325e-06}[Rank 1] Trainer log: {'loss': 0.3063, 'grad_norm': 10.49147891998291, 'learning_rate': 7.98179049475325e-06} - - -[Rank 3] Trainer log: {'loss': 0.3063, 'grad_norm': 10.49147891998291, 'learning_rate': 7.98179049475325e-06} -{'loss': 0.3063, 'grad_norm': 10.49147891998291, 'learning_rate': 7.98179049475325e-06, 'epoch': 0.59} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3608160972595215, 'train/info_loss': 0.25664499402046204, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001317265792749822, 'train/video_loss': 0.2565132677555084, 'train/total_loss': 0.6173293590545654} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1718, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36797475814819336, 'train/info_loss': 0.17365585267543793, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011369192507117987, 'train/video_loss': 0.17354215681552887, 'train/total_loss': 0.541516900062561} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4349, 'grad_norm': 2.9052913188934326, 'learning_rate': 7.971343610008812e-06}[Rank 3] Trainer log: {'loss': 0.4349, 'grad_norm': 2.9052913188934326, 'learning_rate': 7.971343610008812e-06}[Rank 0] Trainer log: {'loss': 0.4349, 'grad_norm': 2.9052913188934326, 'learning_rate': 7.971343610008812e-06} - -[Rank 1] Trainer log: {'loss': 0.4349, 'grad_norm': 2.9052913188934326, 'learning_rate': 7.971343610008812e-06} - -{'loss': 0.4349, 'grad_norm': 2.9052913188934326, 'learning_rate': 7.971343610008812e-06, 'epoch': 0.59} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09323577880859375, 'train/info_loss': 0.17796127498149872, 'train/ref_loss': None, 'train/uncertainty_loss': -8.615686092525721e-05, 'train/video_loss': 0.1778751164674759, 'train/total_loss': 0.2711108922958374} -tensor(0.0345, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2062403917312622, 'train/info_loss': 0.2187064290046692, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010149430017918349, 'train/video_loss': 0.2186049371957779, 'train/total_loss': 0.4248453378677368} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3786, 'grad_norm': 4.577812194824219, 'learning_rate': 7.960899033805408e-06}[Rank 2] Trainer log: {'loss': 0.3786, 'grad_norm': 4.577812194824219, 'learning_rate': 7.960899033805408e-06} - -[Rank 0] Trainer log: {'loss': 0.3786, 'grad_norm': 4.577812194824219, 'learning_rate': 7.960899033805408e-06}[Rank 3] Trainer log: {'loss': 0.3786, 'grad_norm': 4.577812194824219, 'learning_rate': 7.960899033805408e-06} - -{'loss': 0.3786, 'grad_norm': 4.577812194824219, 'learning_rate': 7.960899033805408e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.4209, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2091, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021735862828791143, 'train/lm_loss': 6.174088339321316e-05, 'train/info_loss': 2.914582182711456e-05, 'train/ref_loss': 0.35408949851989746, 'train/uncertainty_loss': 0.020913960039615632, 'train/video_loss': 0.376771479845047, 'train/total_loss': 0.3768332302570343} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1852956771850586, 'train/info_loss': 0.11521470546722412, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013027485692873597, 'train/video_loss': 0.11508443206548691, 'train/total_loss': 0.3003801107406616} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.0302, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4281, 'grad_norm': 1.9677194356918335, 'learning_rate': 7.95045677802861e-06}[Rank 1] Trainer log: {'loss': 0.4281, 'grad_norm': 1.9677194356918335, 'learning_rate': 7.95045677802861e-06}[Rank 2] Trainer log: {'loss': 0.4281, 'grad_norm': 1.9677194356918335, 'learning_rate': 7.95045677802861e-06} - - -{'loss': 0.4281, 'grad_norm': 1.9677194356918335, 'learning_rate': 7.95045677802861e-06, 'epoch': 0.59} -[Rank 3] Trainer log: {'loss': 0.4281, 'grad_norm': 1.9677194356918335, 'learning_rate': 7.95045677802861e-06} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07362222671508789, 'train/info_loss': 0.1705067753791809, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010411815019324423, 'train/video_loss': 0.1704026609659195, 'train/total_loss': 0.24402488768100739} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13258928060531616, 'train/info_loss': 0.15157505869865417, 'train/ref_loss': None, 'train/uncertainty_loss': -9.302309481427074e-05, 'train/video_loss': 0.15148203074932098, 'train/total_loss': 0.28407132625579834} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2806, 'grad_norm': 5.01986837387085, 'learning_rate': 7.94001685456135e-06} -[Rank 1] Trainer log: {'loss': 0.2806, 'grad_norm': 5.01986837387085, 'learning_rate': 7.94001685456135e-06}[Rank 0] Trainer log: {'loss': 0.2806, 'grad_norm': 5.01986837387085, 'learning_rate': 7.94001685456135e-06} - -[Rank 3] Trainer log: {'loss': 0.2806, 'grad_norm': 5.01986837387085, 'learning_rate': 7.94001685456135e-06} -{'loss': 0.2806, 'grad_norm': 5.01986837387085, 'learning_rate': 7.94001685456135e-06, 'epoch': 0.59} -tensor(0.2186, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0437, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020617928821593524, 'train/lm_loss': 8.848140132613481e-05, 'train/info_loss': 2.8669011953752488e-05, 'train/ref_loss': 0.24445605278015137, 'train/uncertainty_loss': 0.004371405765414238, 'train/video_loss': 0.25050556659698486, 'train/total_loss': 0.25059404969215393} -tensor(0.0592, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1420721650123596, 'train/info_loss': 0.09172564744949341, 'train/ref_loss': None, 'train/uncertainty_loss': -9.305874700658024e-05, 'train/video_loss': 0.09163258969783783, 'train/total_loss': 0.23370476067066193} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3292, 'grad_norm': 6.035580158233643, 'learning_rate': 7.929579275283889e-06}[Rank 1] Trainer log: {'loss': 0.3292, 'grad_norm': 6.035580158233643, 'learning_rate': 7.929579275283889e-06}[Rank 3] Trainer log: {'loss': 0.3292, 'grad_norm': 6.035580158233643, 'learning_rate': 7.929579275283889e-06} - - -[Rank 0] Trainer log: {'loss': 0.3292, 'grad_norm': 6.035580158233643, 'learning_rate': 7.929579275283889e-06} -{'loss': 0.3292, 'grad_norm': 6.035580158233643, 'learning_rate': 7.929579275283889e-06, 'epoch': 0.59} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2867, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002244357019662857, 'train/lm_loss': 3.204089007340372e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.061737850308418274, 'train/uncertainty_loss': -6.80452270898968e-05, 'train/video_loss': 0.06348561495542526, 'train/total_loss': 0.06351765245199203} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004628085531294346, 'train/lm_loss': 0.00012107997899875046, 'train/info_loss': 3.027824277523905e-05, 'train/ref_loss': 0.13801546394824982, 'train/uncertainty_loss': -7.79222114942968e-05, 'train/video_loss': 0.14167028665542603, 'train/total_loss': 0.14179137349128723} -[Rank 1] Trainer log: {'loss': 0.287, 'grad_norm': 6.919968605041504, 'learning_rate': 7.919144052073845e-06} -[Rank 2] Trainer log: {'loss': 0.287, 'grad_norm': 6.919968605041504, 'learning_rate': 7.919144052073845e-06}[Rank 0] Trainer log: {'loss': 0.287, 'grad_norm': 6.919968605041504, 'learning_rate': 7.919144052073845e-06} - -[Rank 3] Trainer log: {'loss': 0.287, 'grad_norm': 6.919968605041504, 'learning_rate': 7.919144052073845e-06} -{'loss': 0.287, 'grad_norm': 6.919968605041504, 'learning_rate': 7.919144052073845e-06, 'epoch': 0.59} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18290399312973024, 'train/info_loss': 0.30016443133354187, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013590492308139803, 'train/video_loss': 0.300028532743454, 'train/total_loss': 0.48293250799179077} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1879, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.3215, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22137303352355958, 'train/info_loss': 0.1719212830066681, 'train/ref_loss': None, 'train/uncertainty_loss': -9.38605284318328e-05, 'train/video_loss': 0.17182742059230804, 'train/total_loss': 0.39320045709609985} -tensor(0.5537, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1683, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4993, 'grad_norm': 13.540406227111816, 'learning_rate': 7.908711196806131e-06}[Rank 2] Trainer log: {'loss': 0.4993, 'grad_norm': 13.540406227111816, 'learning_rate': 7.908711196806131e-06} -[Rank 1] Trainer log: {'loss': 0.4993, 'grad_norm': 13.540406227111816, 'learning_rate': 7.908711196806131e-06} - -[Rank 3] Trainer log: {'loss': 0.4993, 'grad_norm': 13.540406227111816, 'learning_rate': 7.908711196806131e-06} -{'loss': 0.4993, 'grad_norm': 13.540406227111816, 'learning_rate': 7.908711196806131e-06, 'epoch': 0.59} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2283, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0378, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0412, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00040622372180223467, 'train/lm_loss': 7.882948848418892e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.1744045615196228, 'train/uncertainty_loss': 0.004121148586273194, 'train/video_loss': 0.1818062663078308, 'train/total_loss': 0.18188509345054626} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3373388528823853, 'train/info_loss': 0.2806244492530823, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001251397072337568, 'train/video_loss': 0.28049930930137634, 'train/total_loss': 0.6178381443023682} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1384, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3772, 'grad_norm': 3.8546433448791504, 'learning_rate': 7.898280721352988e-06} -[Rank 0] Trainer log: {'loss': 0.3772, 'grad_norm': 3.8546433448791504, 'learning_rate': 7.898280721352988e-06}[Rank 3] Trainer log: {'loss': 0.3772, 'grad_norm': 3.8546433448791504, 'learning_rate': 7.898280721352988e-06}[Rank 2] Trainer log: {'loss': 0.3772, 'grad_norm': 3.8546433448791504, 'learning_rate': 7.898280721352988e-06} - - -{'loss': 0.3772, 'grad_norm': 3.8546433448791504, 'learning_rate': 7.898280721352988e-06, 'epoch': 0.59} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06254404783248901, 'train/info_loss': 0.14593036472797394, 'train/ref_loss': None, 'train/uncertainty_loss': -9.560070466250182e-05, 'train/video_loss': 0.1458347588777542, 'train/total_loss': 0.20837880671024323} -tensor(0.1368, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.7225, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002779400441795588, 'train/lm_loss': 5.420903908088804e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.8068410158157349, 'train/uncertainty_loss': 0.07225141525268555, 'train/video_loss': 0.8813416957855225, 'train/total_loss': 0.8813958764076233} -[Rank 1] Trainer log: {'loss': 0.3835, 'grad_norm': 9.983467102050781, 'learning_rate': 7.887852637583927e-06}[Rank 3] Trainer log: {'loss': 0.3835, 'grad_norm': 9.983467102050781, 'learning_rate': 7.887852637583927e-06}[Rank 0] Trainer log: {'loss': 0.3835, 'grad_norm': 9.983467102050781, 'learning_rate': 7.887852637583927e-06} - -[Rank 2] Trainer log: {'loss': 0.3835, 'grad_norm': 9.983467102050781, 'learning_rate': 7.887852637583927e-06} - -{'loss': 0.3835, 'grad_norm': 9.983467102050781, 'learning_rate': 7.887852637583927e-06, 'epoch': 0.59} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.323030948638916, 'train/info_loss': 0.2121509462594986, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014866406563669444, 'train/video_loss': 0.21200227737426758, 'train/total_loss': 0.5350332260131836} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1891, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0653049111366272, 'train/info_loss': 0.1515606939792633, 'train/ref_loss': None, 'train/uncertainty_loss': -8.988846093416214e-05, 'train/video_loss': 0.15147081017494202, 'train/total_loss': 0.21677571535110474} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3418, 'grad_norm': 2.8359487056732178, 'learning_rate': 7.877426957365754e-06}[Rank 3] Trainer log: {'loss': 0.3418, 'grad_norm': 2.8359487056732178, 'learning_rate': 7.877426957365754e-06}[Rank 1] Trainer log: {'loss': 0.3418, 'grad_norm': 2.8359487056732178, 'learning_rate': 7.877426957365754e-06} - - -[Rank 2] Trainer log: {'loss': 0.3418, 'grad_norm': 2.8359487056732178, 'learning_rate': 7.877426957365754e-06} -{'loss': 0.3418, 'grad_norm': 2.8359487056732178, 'learning_rate': 7.877426957365754e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08044473528862, 'train/info_loss': 0.1803637593984604, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010721860453486443, 'train/video_loss': 0.18025654554367065, 'train/total_loss': 0.2607012987136841} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0452, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3014044046401978, 'train/info_loss': 0.4274747967720032, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001287597930058837, 'train/video_loss': 0.42734605073928833, 'train/total_loss': 0.728750467300415} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3423, 'grad_norm': 6.794540882110596, 'learning_rate': 7.867003692562533e-06}[Rank 3] Trainer log: {'loss': 0.3423, 'grad_norm': 6.794540882110596, 'learning_rate': 7.867003692562533e-06} - -[Rank 2] Trainer log: {'loss': 0.3423, 'grad_norm': 6.794540882110596, 'learning_rate': 7.867003692562533e-06} -[Rank 0] Trainer log: {'loss': 0.3423, 'grad_norm': 6.794540882110596, 'learning_rate': 7.867003692562533e-06} -{'loss': 0.3423, 'grad_norm': 6.794540882110596, 'learning_rate': 7.867003692562533e-06, 'epoch': 0.59} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13370500802993776, 'train/info_loss': 0.14711271226406097, 'train/ref_loss': None, 'train/uncertainty_loss': -9.823095751926303e-05, 'train/video_loss': 0.14701448380947113, 'train/total_loss': 0.28071948885917664} -tensor(0.1275, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015780983958393338, 'train/lm_loss': 0.00010204096324741841, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.16082602739334106, 'train/uncertainty_loss': -7.072204607538879e-05, 'train/video_loss': 0.16204789280891418, 'train/total_loss': 0.16214993596076965} -[Rank 2] Trainer log: {'loss': 0.2495, 'grad_norm': 3.433659553527832, 'learning_rate': 7.856582855035578e-06}[Rank 1] Trainer log: {'loss': 0.2495, 'grad_norm': 3.433659553527832, 'learning_rate': 7.856582855035578e-06} - -[Rank 0] Trainer log: {'loss': 0.2495, 'grad_norm': 3.433659553527832, 'learning_rate': 7.856582855035578e-06}[Rank 3] Trainer log: {'loss': 0.2495, 'grad_norm': 3.433659553527832, 'learning_rate': 7.856582855035578e-06} - -{'loss': 0.2495, 'grad_norm': 3.433659553527832, 'learning_rate': 7.856582855035578e-06, 'epoch': 0.59} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024008408654481174, 'train/lm_loss': 3.635553002823144e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.03940065577626228, 'train/uncertainty_loss': -7.221019477583468e-05, 'train/video_loss': 0.04127182811498642, 'train/total_loss': 0.04130818322300911} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3751489877700806, 'train/info_loss': 0.22822266817092896, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012443956220522522, 'train/video_loss': 0.22809822857379913, 'train/total_loss': 0.6032472252845764} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4178, 'grad_norm': 2.750547170639038, 'learning_rate': 7.846164456643445e-06}[Rank 1] Trainer log: {'loss': 0.4178, 'grad_norm': 2.750547170639038, 'learning_rate': 7.846164456643445e-06} - -[Rank 0] Trainer log: {'loss': 0.4178, 'grad_norm': 2.750547170639038, 'learning_rate': 7.846164456643445e-06}[Rank 3] Trainer log: {'loss': 0.4178, 'grad_norm': 2.750547170639038, 'learning_rate': 7.846164456643445e-06} - -{'loss': 0.4178, 'grad_norm': 2.750547170639038, 'learning_rate': 7.846164456643445e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.142816162109375, 'train/info_loss': 0.14793792366981506, 'train/ref_loss': None, 'train/uncertainty_loss': -9.036964038386941e-05, 'train/video_loss': 0.14784754812717438, 'train/total_loss': 0.2906637191772461} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0631, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002088263863697648, 'train/lm_loss': 4.1671225335448984e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.17990684509277344, 'train/uncertainty_loss': -7.319495780393482e-05, 'train/video_loss': 0.18152804672718048, 'train/total_loss': 0.18156972527503967} -tensor(0.2574, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3426, 'grad_norm': 5.074666976928711, 'learning_rate': 7.83574850924191e-06}[Rank 2] Trainer log: {'loss': 0.3426, 'grad_norm': 5.074666976928711, 'learning_rate': 7.83574850924191e-06} - -[Rank 3] Trainer log: {'loss': 0.3426, 'grad_norm': 5.074666976928711, 'learning_rate': 7.83574850924191e-06} -[Rank 0] Trainer log: {'loss': 0.3426, 'grad_norm': 5.074666976928711, 'learning_rate': 7.83574850924191e-06} -{'loss': 0.3426, 'grad_norm': 5.074666976928711, 'learning_rate': 7.83574850924191e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2993391275405884, 'train/info_loss': 0.11496538668870926, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001093423692509532, 'train/video_loss': 0.1148560419678688, 'train/total_loss': 0.41419517993927} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.393855881690979, 'train/info_loss': 0.19263900816440582, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010891100391745568, 'train/video_loss': 0.19253009557724, 'train/total_loss': 0.58638596534729} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3297, 'grad_norm': 3.94773268699646, 'learning_rate': 7.825335024683967e-06}[Rank 3] Trainer log: {'loss': 0.3297, 'grad_norm': 3.94773268699646, 'learning_rate': 7.825335024683967e-06}[Rank 1] Trainer log: {'loss': 0.3297, 'grad_norm': 3.94773268699646, 'learning_rate': 7.825335024683967e-06} - - -[Rank 2] Trainer log: {'loss': 0.3297, 'grad_norm': 3.94773268699646, 'learning_rate': 7.825335024683967e-06} -{'loss': 0.3297, 'grad_norm': 3.94773268699646, 'learning_rate': 7.825335024683967e-06, 'epoch': 0.59} -tensor(0.7538, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024079608265310527, 'train/lm_loss': 4.1623553261160855e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.1834862232208252, 'train/uncertainty_loss': -7.152511971071362e-05, 'train/video_loss': 0.18536561727523804, 'train/total_loss': 0.18540723621845245} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2157777786254883, 'train/info_loss': 0.16164831817150116, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012761394027620556, 'train/video_loss': 0.16152070462703705, 'train/total_loss': 0.3772984743118286} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3603, 'grad_norm': 11.229022026062012, 'learning_rate': 7.814924014819793e-06} -[Rank 2] Trainer log: {'loss': 0.3603, 'grad_norm': 11.229022026062012, 'learning_rate': 7.814924014819793e-06} -[Rank 0] Trainer log: {'loss': 0.3603, 'grad_norm': 11.229022026062012, 'learning_rate': 7.814924014819793e-06}[Rank 3] Trainer log: {'loss': 0.3603, 'grad_norm': 11.229022026062012, 'learning_rate': 7.814924014819793e-06} - -{'loss': 0.3603, 'grad_norm': 11.229022026062012, 'learning_rate': 7.814924014819793e-06, 'epoch': 0.59} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2644446134567261, 'train/info_loss': 0.2512053847312927, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012118408922106028, 'train/video_loss': 0.25108420848846436, 'train/total_loss': 0.5155287981033325} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.7015, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2673, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019226474687457086, 'train/lm_loss': 2.79883824987337e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.39371877908706665, 'train/uncertainty_loss': 0.026726591587066653, 'train/video_loss': 0.422003835439682, 'train/total_loss': 0.42203181982040405} -[Rank 1] Trainer log: {'loss': 0.4912, 'grad_norm': 8.239139556884766, 'learning_rate': 7.804515491496765e-06} -[Rank 2] Trainer log: {'loss': 0.4912, 'grad_norm': 8.239139556884766, 'learning_rate': 7.804515491496765e-06} -[Rank 3] Trainer log: {'loss': 0.4912, 'grad_norm': 8.239139556884766, 'learning_rate': 7.804515491496765e-06} -[Rank 0] Trainer log: {'loss': 0.4912, 'grad_norm': 8.239139556884766, 'learning_rate': 7.804515491496765e-06} -{'loss': 0.4912, 'grad_norm': 8.239139556884766, 'learning_rate': 7.804515491496765e-06, 'epoch': 0.59} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1768, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018891118234023454, 'train/lm_loss': 4.190959443803877e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.3324098587036133, 'train/uncertainty_loss': 0.01768471598625183, 'train/video_loss': 0.3516292870044708, 'train/total_loss': 0.3516711890697479} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.44789953231811525, 'train/info_loss': 0.2712928354740143, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015320940874516965, 'train/video_loss': 0.27113962173461914, 'train/total_loss': 0.7190392017364502} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4128, 'grad_norm': 5.9371843338012695, 'learning_rate': 7.794109466559427e-06}[Rank 1] Trainer log: {'loss': 0.4128, 'grad_norm': 5.9371843338012695, 'learning_rate': 7.794109466559427e-06} - -[Rank 2] Trainer log: {'loss': 0.4128, 'grad_norm': 5.9371843338012695, 'learning_rate': 7.794109466559427e-06} -[Rank 3] Trainer log: {'loss': 0.4128, 'grad_norm': 5.9371843338012695, 'learning_rate': 7.794109466559427e-06} -{'loss': 0.4128, 'grad_norm': 5.9371843338012695, 'learning_rate': 7.794109466559427e-06, 'epoch': 0.59} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11009162664413452, 'train/info_loss': 0.12211618572473526, 'train/ref_loss': None, 'train/uncertainty_loss': -9.189867996610701e-05, 'train/video_loss': 0.1220242902636528, 'train/total_loss': 0.23211592435836792} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0012, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00014523683348670603, 'train/lm_loss': 4.112297610845417e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.14712956547737122, 'train/uncertainty_loss': -6.802547140978276e-05, 'train/video_loss': 0.14824756979942322, 'train/total_loss': 0.14828869700431824} -[Rank 1] Trainer log: {'loss': 0.2793, 'grad_norm': 3.8092219829559326, 'learning_rate': 7.78370595184947e-06} -[Rank 2] Trainer log: {'loss': 0.2793, 'grad_norm': 3.8092219829559326, 'learning_rate': 7.78370595184947e-06} -[Rank 3] Trainer log: {'loss': 0.2793, 'grad_norm': 3.8092219829559326, 'learning_rate': 7.78370595184947e-06} -[Rank 0] Trainer log: {'loss': 0.2793, 'grad_norm': 3.8092219829559326, 'learning_rate': 7.78370595184947e-06} -{'loss': 0.2793, 'grad_norm': 3.8092219829559326, 'learning_rate': 7.78370595184947e-06, 'epoch': 0.59} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.4176, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00013277443358674645, 'train/lm_loss': 2.817909116856754e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.504716157913208, 'train/uncertainty_loss': 0.04175951480865479, 'train/video_loss': 0.5475575923919678, 'train/total_loss': 0.5475857853889465} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1162, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(1.0878, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.00017434384208172562, 'train/lm_loss': 2.1718753851018848e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.9592832922935486, 'train/uncertainty_loss': 0.10877858400344849, 'train/video_loss': 1.0694737434387207, 'train/total_loss': 1.069495439529419} -[Rank 1] Trainer log: {'loss': 0.5008, 'grad_norm': 14.949400901794434, 'learning_rate': 7.773304959205739e-06} -[Rank 2] Trainer log: {'loss': 0.5008, 'grad_norm': 14.949400901794434, 'learning_rate': 7.773304959205739e-06} -[Rank 0] Trainer log: {'loss': 0.5008, 'grad_norm': 14.949400901794434, 'learning_rate': 7.773304959205739e-06}[Rank 3] Trainer log: {'loss': 0.5008, 'grad_norm': 14.949400901794434, 'learning_rate': 7.773304959205739e-06} - -{'loss': 0.5008, 'grad_norm': 14.949400901794434, 'learning_rate': 7.773304959205739e-06, 'epoch': 0.59} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0264, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018547301879152657, 'train/lm_loss': 3.6879954859614375e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.21521054208278656, 'train/uncertainty_loss': 0.002637326717376709, 'train/video_loss': 0.21935436129570007, 'train/total_loss': 0.21939124166965485} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04846282005310059, 'train/info_loss': 0.24395129084587097, 'train/ref_loss': None, 'train/uncertainty_loss': -9.407388861291111e-05, 'train/video_loss': 0.2438572198152542, 'train/total_loss': 0.29232004284858704} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3459, 'grad_norm': 2.7006547451019287, 'learning_rate': 7.7629065004642e-06} -[Rank 3] Trainer log: {'loss': 0.3459, 'grad_norm': 2.7006547451019287, 'learning_rate': 7.7629065004642e-06} -[Rank 2] Trainer log: {'loss': 0.3459, 'grad_norm': 2.7006547451019287, 'learning_rate': 7.7629065004642e-06} -[Rank 0] Trainer log: {'loss': 0.3459, 'grad_norm': 2.7006547451019287, 'learning_rate': 7.7629065004642e-06} -{'loss': 0.3459, 'grad_norm': 2.7006547451019287, 'learning_rate': 7.7629065004642e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07510597109794617, 'train/info_loss': 0.24960657954216003, 'train/ref_loss': None, 'train/uncertainty_loss': -8.936626836657524e-05, 'train/video_loss': 0.24951721727848053, 'train/total_loss': 0.3246231973171234} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19016557931900024, 'train/info_loss': 0.1583804041147232, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012014046078547836, 'train/video_loss': 0.1582602709531784, 'train/total_loss': 0.34842586517333984} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3878, 'grad_norm': 3.026332378387451, 'learning_rate': 7.75251058745795e-06}[Rank 2] Trainer log: {'loss': 0.3878, 'grad_norm': 3.026332378387451, 'learning_rate': 7.75251058745795e-06} - -[Rank 0] Trainer log: {'loss': 0.3878, 'grad_norm': 3.026332378387451, 'learning_rate': 7.75251058745795e-06} -[Rank 3] Trainer log: {'loss': 0.3878, 'grad_norm': 3.026332378387451, 'learning_rate': 7.75251058745795e-06} -{'loss': 0.3878, 'grad_norm': 3.026332378387451, 'learning_rate': 7.75251058745795e-06, 'epoch': 0.59} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.7707, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016111176228150725, 'train/lm_loss': 6.905793561600149e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.7287635803222656, 'train/uncertainty_loss': 0.07707285284996034, 'train/video_loss': 0.8071560859680176, 'train/total_loss': 0.8072251677513123} -tensor(0.1501, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3869994163513184, 'train/info_loss': 0.18649138510227203, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001291242544539273, 'train/video_loss': 0.18636226654052734, 'train/total_loss': 0.5733616948127747} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.1169, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4275, 'grad_norm': 3.173077344894409, 'learning_rate': 7.74211723201717e-06} -[Rank 1] Trainer log: {'loss': 0.4275, 'grad_norm': 3.173077344894409, 'learning_rate': 7.74211723201717e-06}[Rank 2] Trainer log: {'loss': 0.4275, 'grad_norm': 3.173077344894409, 'learning_rate': 7.74211723201717e-06} - -[Rank 0] Trainer log: {'loss': 0.4275, 'grad_norm': 3.173077344894409, 'learning_rate': 7.74211723201717e-06} -{'loss': 0.4275, 'grad_norm': 3.173077344894409, 'learning_rate': 7.74211723201717e-06, 'epoch': 0.59} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023631604854017498, 'train/lm_loss': 2.8131413273513317e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.1406382918357849, 'train/uncertainty_loss': -6.996968295425177e-05, 'train/video_loss': 0.14247827231884003, 'train/total_loss': 0.142506405711174} -tensor(-0.0016, device='cuda:1', grad_fn=) tensor(-0.0016, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23319098949432374, 'train/info_loss': 0.20762018859386444, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001346903038211167, 'train/video_loss': 0.20748549699783325, 'train/total_loss': 0.4406765103340149} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3211, 'grad_norm': 3.455209732055664, 'learning_rate': 7.73172644596914e-06}[Rank 1] Trainer log: {'loss': 0.3211, 'grad_norm': 3.455209732055664, 'learning_rate': 7.73172644596914e-06} -[Rank 3] Trainer log: {'loss': 0.3211, 'grad_norm': 3.455209732055664, 'learning_rate': 7.73172644596914e-06} -[Rank 2] Trainer log: {'loss': 0.3211, 'grad_norm': 3.455209732055664, 'learning_rate': 7.73172644596914e-06} - -{'loss': 0.3211, 'grad_norm': 3.455209732055664, 'learning_rate': 7.73172644596914e-06, 'epoch': 0.59} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3572758674621582, 'train/info_loss': 0.14719876646995544, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011153947561979294, 'train/video_loss': 0.1470872312784195, 'train/total_loss': 0.5043631196022034} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21764738559722902, 'train/info_loss': 0.23354318737983704, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013605484273284673, 'train/video_loss': 0.2334071397781372, 'train/total_loss': 0.45105451345443726} -[Rank 1] Trainer log: {'loss': 0.4163, 'grad_norm': 2.094433307647705, 'learning_rate': 7.72133824113823e-06} -[Rank 3] Trainer log: {'loss': 0.4163, 'grad_norm': 2.094433307647705, 'learning_rate': 7.72133824113823e-06} -[Rank 0] Trainer log: {'loss': 0.4163, 'grad_norm': 2.094433307647705, 'learning_rate': 7.72133824113823e-06}[Rank 2] Trainer log: {'loss': 0.4163, 'grad_norm': 2.094433307647705, 'learning_rate': 7.72133824113823e-06} - -{'loss': 0.4163, 'grad_norm': 2.094433307647705, 'learning_rate': 7.72133824113823e-06, 'epoch': 0.59} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0355, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1403, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025633820332586764, 'train/lm_loss': 3.614099114201963e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.3003348112106323, 'train/uncertainty_loss': 0.014034076035022736, 'train/video_loss': 0.31644192337989807, 'train/total_loss': 0.31647807359695435} -tensor(0.0166, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1322, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021421273704618217, 'train/lm_loss': 3.6617740988731384e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.2946646809577942, 'train/uncertainty_loss': 0.013220897316932679, 'train/video_loss': 0.3096230626106262, 'train/total_loss': 0.3096596896648407}tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2995, 'grad_norm': 4.418745040893555, 'learning_rate': 7.710952629345842e-06} -[Rank 1] Trainer log: {'loss': 0.2995, 'grad_norm': 4.418745040893555, 'learning_rate': 7.710952629345842e-06}[Rank 0] Trainer log: {'loss': 0.2995, 'grad_norm': 4.418745040893555, 'learning_rate': 7.710952629345842e-06} - -[Rank 3] Trainer log: {'loss': 0.2995, 'grad_norm': 4.418745040893555, 'learning_rate': 7.710952629345842e-06} -{'loss': 0.2995, 'grad_norm': 4.418745040893555, 'learning_rate': 7.710952629345842e-06, 'epoch': 0.59} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11795279979705811, 'train/info_loss': 0.1540822982788086, 'train/ref_loss': None, 'train/uncertainty_loss': -9.443669114261866e-05, 'train/video_loss': 0.153987854719162, 'train/total_loss': 0.2719406485557556} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0118, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2702207565307617, 'train/info_loss': 0.323302686214447, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010866206139326096, 'train/video_loss': 0.3231940269470215, 'train/total_loss': 0.5934147834777832} -[Rank 1] Trainer log: {'loss': 0.3643, 'grad_norm': 2.20123553276062, 'learning_rate': 7.700569622410454e-06}[Rank 2] Trainer log: {'loss': 0.3643, 'grad_norm': 2.20123553276062, 'learning_rate': 7.700569622410454e-06} - -[Rank 0] Trainer log: {'loss': 0.3643, 'grad_norm': 2.20123553276062, 'learning_rate': 7.700569622410454e-06}[Rank 3] Trainer log: {'loss': 0.3643, 'grad_norm': 2.20123553276062, 'learning_rate': 7.700569622410454e-06} - -{'loss': 0.3643, 'grad_norm': 2.20123553276062, 'learning_rate': 7.700569622410454e-06, 'epoch': 0.6} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12217814922332765, 'train/info_loss': 0.1844712197780609, 'train/ref_loss': None, 'train/uncertainty_loss': -9.90062253549695e-05, 'train/video_loss': 0.184372216463089, 'train/total_loss': 0.30655038356781006} -tensor(0.0142, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4256895542144776, 'train/info_loss': 0.05673032999038696, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012231003493070602, 'train/video_loss': 0.05660802125930786, 'train/total_loss': 0.4822975695133209} -tensor(0.0832, device='cuda:1', grad_fn=) tensor(-0.0006, device='cuda:1', grad_fn=) -tensor(0.1859, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0134, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2665, 'grad_norm': 4.459836959838867, 'learning_rate': 7.690189232147566e-06} -[Rank 3] Trainer log: {'loss': 0.2665, 'grad_norm': 4.459836959838867, 'learning_rate': 7.690189232147566e-06} -[Rank 0] Trainer log: {'loss': 0.2665, 'grad_norm': 4.459836959838867, 'learning_rate': 7.690189232147566e-06}[Rank 2] Trainer log: {'loss': 0.2665, 'grad_norm': 4.459836959838867, 'learning_rate': 7.690189232147566e-06} - -{'loss': 0.2665, 'grad_norm': 4.459836959838867, 'learning_rate': 7.690189232147566e-06, 'epoch': 0.6} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2036, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020011200103908777, 'train/lm_loss': 2.1718753851018848e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.3481380343437195, 'train/uncertainty_loss': 0.020359499752521517, 'train/video_loss': 0.37011754512786865, 'train/total_loss': 0.3701392710208893} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0710, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36978337764739994, 'train/info_loss': 0.09081748872995377, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010888495016843082, 'train/video_loss': 0.09070860594511032, 'train/total_loss': 0.46049198508262634} -tensor(0.0437, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0016, device='cuda:3', grad_fn=) tensor(-0.0016, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2545, 'grad_norm': 5.0259599685668945, 'learning_rate': 7.67981147036971e-06}[Rank 2] Trainer log: {'loss': 0.2545, 'grad_norm': 5.0259599685668945, 'learning_rate': 7.67981147036971e-06}[Rank 1] Trainer log: {'loss': 0.2545, 'grad_norm': 5.0259599685668945, 'learning_rate': 7.67981147036971e-06} - - -[Rank 0] Trainer log: {'loss': 0.2545, 'grad_norm': 5.0259599685668945, 'learning_rate': 7.67981147036971e-06} -{'loss': 0.2545, 'grad_norm': 5.0259599685668945, 'learning_rate': 7.67981147036971e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0461246132850647, 'train/info_loss': 0.16313619911670685, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010505182435736061, 'train/video_loss': 0.16303114593029022, 'train/total_loss': 0.20915576815605164} -tensor(0.0663, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3204664707183838, 'train/info_loss': 0.1769418865442276, 'train/ref_loss': None, 'train/uncertainty_loss': -7.565144915133715e-05, 'train/video_loss': 0.17686623334884644, 'train/total_loss': 0.49733272194862366} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.3950, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.323, 'grad_norm': 7.854076862335205, 'learning_rate': 7.669436348886413e-06}[Rank 2] Trainer log: {'loss': 0.323, 'grad_norm': 7.854076862335205, 'learning_rate': 7.669436348886413e-06}[Rank 1] Trainer log: {'loss': 0.323, 'grad_norm': 7.854076862335205, 'learning_rate': 7.669436348886413e-06}[Rank 3] Trainer log: {'loss': 0.323, 'grad_norm': 7.854076862335205, 'learning_rate': 7.669436348886413e-06} - - - -{'loss': 0.323, 'grad_norm': 7.854076862335205, 'learning_rate': 7.669436348886413e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.4125, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0004910165444016457, 'train/lm_loss': 3.2279270817525686e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.1574905812740326, 'train/uncertainty_loss': -7.769016665406526e-05, 'train/video_loss': 0.1613626480102539, 'train/total_loss': 0.16139492392539978} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0168, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017966016894206406, 'train/lm_loss': 4.159971431363374e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.1220608800649643, 'train/uncertainty_loss': -6.767876911908389e-05, 'train/video_loss': 0.1234535500407219, 'train/total_loss': 0.12349514663219452} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3727, 'grad_norm': 4.307126998901367, 'learning_rate': 7.65906387950421e-06}[Rank 2] Trainer log: {'loss': 0.3727, 'grad_norm': 4.307126998901367, 'learning_rate': 7.65906387950421e-06} -[Rank 0] Trainer log: {'loss': 0.3727, 'grad_norm': 4.307126998901367, 'learning_rate': 7.65906387950421e-06} - -[Rank 3] Trainer log: {'loss': 0.3727, 'grad_norm': 4.307126998901367, 'learning_rate': 7.65906387950421e-06} -{'loss': 0.3727, 'grad_norm': 4.307126998901367, 'learning_rate': 7.65906387950421e-06, 'epoch': 0.6} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2395, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00036186052020639183, 'train/lm_loss': 4.1480531217530375e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.12859846651554108, 'train/uncertainty_loss': -7.378943264484406e-05, 'train/video_loss': 0.13144488632678986, 'train/total_loss': 0.13148637115955353} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0960, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019979318603873254, 'train/lm_loss': 2.8083735378459097e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.2733592391014099, 'train/uncertainty_loss': 0.009596331417560578, 'train/video_loss': 0.2845748960971832, 'train/total_loss': 0.28460296988487244} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.28, 'grad_norm': 7.697661876678467, 'learning_rate': 7.648694074026616e-06}[Rank 3] Trainer log: {'loss': 0.28, 'grad_norm': 7.697661876678467, 'learning_rate': 7.648694074026616e-06}[Rank 1] Trainer log: {'loss': 0.28, 'grad_norm': 7.697661876678467, 'learning_rate': 7.648694074026616e-06} - - -[Rank 0] Trainer log: {'loss': 0.28, 'grad_norm': 7.697661876678467, 'learning_rate': 7.648694074026616e-06} -{'loss': 0.28, 'grad_norm': 7.697661876678467, 'learning_rate': 7.648694074026616e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20308387279510498, 'train/info_loss': 0.1979193538427353, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012051863595843316, 'train/video_loss': 0.19779883325099945, 'train/total_loss': 0.4008827209472656} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18917366266250613, 'train/info_loss': 0.1902499794960022, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011460874229669572, 'train/video_loss': 0.19013537466526031, 'train/total_loss': 0.3793090581893921} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.45, 'grad_norm': 4.173702716827393, 'learning_rate': 7.638326944254107e-06}[Rank 2] Trainer log: {'loss': 0.45, 'grad_norm': 4.173702716827393, 'learning_rate': 7.638326944254107e-06}[Rank 3] Trainer log: {'loss': 0.45, 'grad_norm': 4.173702716827393, 'learning_rate': 7.638326944254107e-06} - - -[Rank 0] Trainer log: {'loss': 0.45, 'grad_norm': 4.173702716827393, 'learning_rate': 7.638326944254107e-06} -{'loss': 0.45, 'grad_norm': 4.173702716827393, 'learning_rate': 7.638326944254107e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002917618025094271, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.2128363698720932, 'train/uncertainty_loss': -7.475423626601696e-05, 'train/video_loss': 0.215118408203125, 'train/total_loss': 0.21515047550201416} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10173180103302003, 'train/info_loss': 0.12733551859855652, 'train/ref_loss': None, 'train/uncertainty_loss': -9.128705714829267e-05, 'train/video_loss': 0.127244234085083, 'train/total_loss': 0.2289760410785675} -tensor(0.0551, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0353, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2519, 'grad_norm': 3.487684965133667, 'learning_rate': 7.627962501984122e-06}[Rank 3] Trainer log: {'loss': 0.2519, 'grad_norm': 3.487684965133667, 'learning_rate': 7.627962501984122e-06}[Rank 1] Trainer log: {'loss': 0.2519, 'grad_norm': 3.487684965133667, 'learning_rate': 7.627962501984122e-06} - - -[Rank 0] Trainer log: {'loss': 0.2519, 'grad_norm': 3.487684965133667, 'learning_rate': 7.627962501984122e-06} -{'loss': 0.2519, 'grad_norm': 3.487684965133667, 'learning_rate': 7.627962501984122e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00028527448885142805, 'train/lm_loss': 6.955844582989812e-05, 'train/info_loss': 3.1053055863594636e-05, 'train/ref_loss': 0.15818822383880615, 'train/uncertainty_loss': -6.856922409497202e-05, 'train/video_loss': 0.16043290495872498, 'train/total_loss': 0.16050246357917786} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3744, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001756998128257692, 'train/lm_loss': 3.628401609603316e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.17682993412017822, 'train/uncertainty_loss': -6.943020271137357e-05, 'train/video_loss': 0.17818917334079742, 'train/total_loss': 0.17822545766830444} -tensor(0.2690, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.417, 'grad_norm': 5.834446907043457, 'learning_rate': 7.617600759011036e-06}[Rank 0] Trainer log: {'loss': 0.417, 'grad_norm': 5.834446907043457, 'learning_rate': 7.617600759011036e-06}[Rank 1] Trainer log: {'loss': 0.417, 'grad_norm': 5.834446907043457, 'learning_rate': 7.617600759011036e-06} - - -[Rank 2] Trainer log: {'loss': 0.417, 'grad_norm': 5.834446907043457, 'learning_rate': 7.617600759011036e-06} -{'loss': 0.417, 'grad_norm': 5.834446907043457, 'learning_rate': 7.617600759011036e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.2144, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000260279537178576, 'train/lm_loss': 3.673692990560085e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.20732852816581726, 'train/uncertainty_loss': -6.85872626490891e-05, 'train/video_loss': 0.2093656063079834, 'train/total_loss': 0.20940233767032623} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.4066, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1211, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00032275419216603045, 'train/lm_loss': 2.457944501657039e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.289724737405777, 'train/uncertainty_loss': 0.012113715708255769, 'train/video_loss': 0.30444082617759705, 'train/total_loss': 0.3044654130935669} -[Rank 1] Trainer log: {'loss': 0.3856, 'grad_norm': 4.584377765655518, 'learning_rate': 7.607241727126163e-06}[Rank 0] Trainer log: {'loss': 0.3856, 'grad_norm': 4.584377765655518, 'learning_rate': 7.607241727126163e-06} -[Rank 3] Trainer log: {'loss': 0.3856, 'grad_norm': 4.584377765655518, 'learning_rate': 7.607241727126163e-06} - -[Rank 2] Trainer log: {'loss': 0.3856, 'grad_norm': 4.584377765655518, 'learning_rate': 7.607241727126163e-06} -{'loss': 0.3856, 'grad_norm': 4.584377765655518, 'learning_rate': 7.607241727126163e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25175037384033205, 'train/info_loss': 0.17942382395267487, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011686765355989337, 'train/video_loss': 0.17930695414543152, 'train/total_loss': 0.431057333946228} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19914772510528567, 'train/info_loss': 0.12948384881019592, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010485277744010091, 'train/video_loss': 0.12937898933887482, 'train/total_loss': 0.32852673530578613} -tensor(0.3459, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0104, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.386, 'grad_norm': 10.542059898376465, 'learning_rate': 7.596885418117714e-06} -[Rank 2] Trainer log: {'loss': 0.386, 'grad_norm': 10.542059898376465, 'learning_rate': 7.596885418117714e-06} -[Rank 0] Trainer log: {'loss': 0.386, 'grad_norm': 10.542059898376465, 'learning_rate': 7.596885418117714e-06}[Rank 3] Trainer log: {'loss': 0.386, 'grad_norm': 10.542059898376465, 'learning_rate': 7.596885418117714e-06} - -{'loss': 0.386, 'grad_norm': 10.542059898376465, 'learning_rate': 7.596885418117714e-06, 'epoch': 0.6} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0163, device='cuda:0', grad_fn=) tensor(0.1637, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002671087859198451, 'train/lm_loss': 4.164738929830492e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.22664247453212738, 'train/uncertainty_loss': 0.0016329223290085793, 'train/video_loss': 0.2304380238056183, 'train/total_loss': 0.2304796725511551} -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1261, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2184701919555664, 'train/info_loss': 0.26789894700050354, 'train/ref_loss': None, 'train/uncertainty_loss': -9.709068690426648e-05, 'train/video_loss': 0.26780185103416443, 'train/total_loss': 0.48627203702926636} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3217, 'grad_norm': 11.546943664550781, 'learning_rate': 7.586531843770818e-06} -[Rank 2] Trainer log: {'loss': 0.3217, 'grad_norm': 11.546943664550781, 'learning_rate': 7.586531843770818e-06} -[Rank 0] Trainer log: {'loss': 0.3217, 'grad_norm': 11.546943664550781, 'learning_rate': 7.586531843770818e-06}[Rank 1] Trainer log: {'loss': 0.3217, 'grad_norm': 11.546943664550781, 'learning_rate': 7.586531843770818e-06} - -{'loss': 0.3217, 'grad_norm': 11.546943664550781, 'learning_rate': 7.586531843770818e-06, 'epoch': 0.6} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1191, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.406261682510376, 'train/info_loss': 0.27982619404792786, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012428914196789266, 'train/video_loss': 0.27970191836357117, 'train/total_loss': 0.6859636306762695} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1581, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0006835372187197208, 'train/lm_loss': 4.617634695023299e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.3184157609939575, 'train/uncertainty_loss': 0.015805424749851228, 'train/video_loss': 0.33971482515335083, 'train/total_loss': 0.33976098895072937} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4885, 'grad_norm': 7.542090892791748, 'learning_rate': 7.576181015867488e-06}[Rank 2] Trainer log: {'loss': 0.4885, 'grad_norm': 7.542090892791748, 'learning_rate': 7.576181015867488e-06}[Rank 1] Trainer log: {'loss': 0.4885, 'grad_norm': 7.542090892791748, 'learning_rate': 7.576181015867488e-06} - - -[Rank 0] Trainer log: {'loss': 0.4885, 'grad_norm': 7.542090892791748, 'learning_rate': 7.576181015867488e-06} -{'loss': 0.4885, 'grad_norm': 7.542090892791748, 'learning_rate': 7.576181015867488e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2851, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2741, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016304615419358016, 'train/lm_loss': 3.635553002823144e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.1963132619857788, 'train/uncertainty_loss': -7.211749907582998e-05, 'train/video_loss': 0.19756965339183807, 'train/total_loss': 0.19760601222515106} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3527570724487305, 'train/info_loss': 0.10959415882825851, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010497638722881675, 'train/video_loss': 0.10948918014764786, 'train/total_loss': 0.46224623918533325} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.369, 'grad_norm': 3.8267314434051514, 'learning_rate': 7.565832946186601e-06} -[Rank 3] Trainer log: {'loss': 0.369, 'grad_norm': 3.8267314434051514, 'learning_rate': 7.565832946186601e-06} -[Rank 2] Trainer log: {'loss': 0.369, 'grad_norm': 3.8267314434051514, 'learning_rate': 7.565832946186601e-06} -[Rank 0] Trainer log: {'loss': 0.369, 'grad_norm': 3.8267314434051514, 'learning_rate': 7.565832946186601e-06} -{'loss': 0.369, 'grad_norm': 3.8267314434051514, 'learning_rate': 7.565832946186601e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1154, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016906645614653826, 'train/lm_loss': 2.493702922947705e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.2893061339855194, 'train/uncertainty_loss': 0.011540085077285767, 'train/video_loss': 0.3022203743457794, 'train/total_loss': 0.3022453188896179} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.3588, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002929343841969967, 'train/lm_loss': 8.84099048562348e-05, 'train/info_loss': 2.9384225854300894e-05, 'train/ref_loss': 0.45973604917526245, 'train/uncertainty_loss': 0.03587656915187836, 'train/video_loss': 0.49798548221588135, 'train/total_loss': 0.49807390570640564} -[Rank 1] Trainer log: {'loss': 0.3693, 'grad_norm': 6.8969926834106445, 'learning_rate': 7.555487646503912e-06}[Rank 0] Trainer log: {'loss': 0.3693, 'grad_norm': 6.8969926834106445, 'learning_rate': 7.555487646503912e-06} -[Rank 2] Trainer log: {'loss': 0.3693, 'grad_norm': 6.8969926834106445, 'learning_rate': 7.555487646503912e-06} -[Rank 3] Trainer log: {'loss': 0.3693, 'grad_norm': 6.8969926834106445, 'learning_rate': 7.555487646503912e-06} - -{'loss': 0.3693, 'grad_norm': 6.8969926834106445, 'learning_rate': 7.555487646503912e-06, 'epoch': 0.6} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023652513045817615, 'train/lm_loss': 4.7129800077527764e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.14192670583724976, 'train/uncertainty_loss': -7.101679802872241e-05, 'train/video_loss': 0.1437748223543167, 'train/total_loss': 0.14382195472717285} -tensor(0.1447, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3031, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0790, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025715823285281657, 'train/lm_loss': 4.14328562328592e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.26602524518966675, 'train/uncertainty_loss': 0.007900532335042953, 'train/video_loss': 0.27600759267807007, 'train/total_loss': 0.27604901790618896} -[Rank 0] Trainer log: {'loss': 0.2934, 'grad_norm': 13.219318389892578, 'learning_rate': 7.545145128592009e-06}[Rank 1] Trainer log: {'loss': 0.2934, 'grad_norm': 13.219318389892578, 'learning_rate': 7.545145128592009e-06} -[Rank 2] Trainer log: {'loss': 0.2934, 'grad_norm': 13.219318389892578, 'learning_rate': 7.545145128592009e-06} -[Rank 3] Trainer log: {'loss': 0.2934, 'grad_norm': 13.219318389892578, 'learning_rate': 7.545145128592009e-06} - -{'loss': 0.2934, 'grad_norm': 13.219318389892578, 'learning_rate': 7.545145128592009e-06, 'epoch': 0.6} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11042360067367554, 'train/info_loss': 0.20882554352283478, 'train/ref_loss': None, 'train/uncertainty_loss': -9.530109819024802e-05, 'train/video_loss': 0.20873023569583893, 'train/total_loss': 0.3191538453102112} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3299708127975464, 'train/info_loss': 0.4605627655982971, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011698735179379583, 'train/video_loss': 0.4604457914829254, 'train/total_loss': 0.7904165983200073} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4241, 'grad_norm': 4.492098331451416, 'learning_rate': 7.534805404220327e-06}[Rank 2] Trainer log: {'loss': 0.4241, 'grad_norm': 4.492098331451416, 'learning_rate': 7.534805404220327e-06} -[Rank 3] Trainer log: {'loss': 0.4241, 'grad_norm': 4.492098331451416, 'learning_rate': 7.534805404220327e-06} - -[Rank 0] Trainer log: {'loss': 0.4241, 'grad_norm': 4.492098331451416, 'learning_rate': 7.534805404220327e-06} -{'loss': 0.4241, 'grad_norm': 4.492098331451416, 'learning_rate': 7.534805404220327e-06, 'epoch': 0.6} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17918998003005981, 'train/info_loss': 0.1508532166481018, 'train/ref_loss': None, 'train/uncertainty_loss': -9.749698219820858e-05, 'train/video_loss': 0.15075571835041046, 'train/total_loss': 0.3299456834793091} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.4533, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0569, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00039956849068403246, 'train/lm_loss': 3.6021802225150167e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.25196051597595215, 'train/uncertainty_loss': 0.0056945160031318665, 'train/video_loss': 0.2608757019042969, 'train/total_loss': 0.2609117329120636} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.341, 'grad_norm': 12.944239616394043, 'learning_rate': 7.524468485155108e-06}[Rank 0] Trainer log: {'loss': 0.341, 'grad_norm': 12.944239616394043, 'learning_rate': 7.524468485155108e-06} -[Rank 2] Trainer log: {'loss': 0.341, 'grad_norm': 12.944239616394043, 'learning_rate': 7.524468485155108e-06} -[Rank 3] Trainer log: {'loss': 0.341, 'grad_norm': 12.944239616394043, 'learning_rate': 7.524468485155108e-06} - -{'loss': 0.341, 'grad_norm': 12.944239616394043, 'learning_rate': 7.524468485155108e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002970511792227626, 'train/lm_loss': 4.7082128003239634e-05, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.1703183948993683, 'train/uncertainty_loss': -6.897957646287978e-05, 'train/video_loss': 0.17265258729457855, 'train/total_loss': 0.1726996749639511} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4058058261871338, 'train/info_loss': 0.3258179724216461, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011577889090403916, 'train/video_loss': 0.3257021903991699, 'train/total_loss': 0.7315080165863037} -tensor(0.0708, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3843, 'grad_norm': 3.331165075302124, 'learning_rate': 7.514134383159414e-06} -[Rank 0] Trainer log: {'loss': 0.3843, 'grad_norm': 3.331165075302124, 'learning_rate': 7.514134383159414e-06} -[Rank 3] Trainer log: {'loss': 0.3843, 'grad_norm': 3.331165075302124, 'learning_rate': 7.514134383159414e-06}[Rank 2] Trainer log: {'loss': 0.3843, 'grad_norm': 3.331165075302124, 'learning_rate': 7.514134383159414e-06} - -{'loss': 0.3843, 'grad_norm': 3.331165075302124, 'learning_rate': 7.514134383159414e-06, 'epoch': 0.6} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2906123399734497, 'train/info_loss': 0.23253260552883148, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012736804783344268, 'train/video_loss': 0.23240523040294647, 'train/total_loss': 0.5230175852775574} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1789, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1234, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0385, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018321630777791144, 'train/lm_loss': 4.7129800077527764e-05, 'train/info_loss': 2.735778434725944e-05, 'train/ref_loss': 0.24208517372608185, 'train/uncertainty_loss': 0.0038536142557859423, 'train/video_loss': 0.2474318891763687, 'train/total_loss': 0.24747902154922485} -[Rank 1] Trainer log: {'loss': 0.4502, 'grad_norm': 12.275125503540039, 'learning_rate': 7.503803109993102e-06}[Rank 2] Trainer log: {'loss': 0.4502, 'grad_norm': 12.275125503540039, 'learning_rate': 7.503803109993102e-06} - -[Rank 0] Trainer log: {'loss': 0.4502, 'grad_norm': 12.275125503540039, 'learning_rate': 7.503803109993102e-06} -[Rank 3] Trainer log: {'loss': 0.4502, 'grad_norm': 12.275125503540039, 'learning_rate': 7.503803109993102e-06} -{'loss': 0.4502, 'grad_norm': 12.275125503540039, 'learning_rate': 7.503803109993102e-06, 'epoch': 0.6} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16621037721633913, 'train/info_loss': 0.3299708366394043, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001228487817570567, 'train/video_loss': 0.3298479914665222, 'train/total_loss': 0.4960583746433258} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023044899571686984, 'train/lm_loss': 5.327945691533387e-05, 'train/info_loss': 2.962263170047663e-05, 'train/ref_loss': 0.12401854991912842, 'train/uncertainty_loss': -7.115952903404833e-05, 'train/video_loss': 0.1258206069469452, 'train/total_loss': 0.1258738934993744} -[Rank 2] Trainer log: {'loss': 0.3511, 'grad_norm': 3.3317365646362305, 'learning_rate': 7.493474677412795e-06}[Rank 1] Trainer log: {'loss': 0.3511, 'grad_norm': 3.3317365646362305, 'learning_rate': 7.493474677412795e-06} - -[Rank 3] Trainer log: {'loss': 0.3511, 'grad_norm': 3.3317365646362305, 'learning_rate': 7.493474677412795e-06} -[Rank 0] Trainer log: {'loss': 0.3511, 'grad_norm': 3.3317365646362305, 'learning_rate': 7.493474677412795e-06} -{'loss': 0.3511, 'grad_norm': 3.3317365646362305, 'learning_rate': 7.493474677412795e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1692, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0986, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017158103873953225, 'train/lm_loss': 4.121832607779652e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.28304117918014526, 'train/uncertainty_loss': 0.009863854199647904, 'train/video_loss': 0.2943025827407837, 'train/total_loss': 0.2943437993526459} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002512060338631272, 'train/lm_loss': 6.057298742234707e-05, 'train/info_loss': 3.0516646802425385e-05, 'train/ref_loss': 0.1588289588689804, 'train/uncertainty_loss': -7.013662834651769e-05, 'train/video_loss': 0.16079898178577423, 'train/total_loss': 0.16085955500602722} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2639, 'grad_norm': 10.152749061584473, 'learning_rate': 7.483149097171902e-06}[Rank 0] Trainer log: {'loss': 0.2639, 'grad_norm': 10.152749061584473, 'learning_rate': 7.483149097171902e-06} -[Rank 3] Trainer log: {'loss': 0.2639, 'grad_norm': 10.152749061584473, 'learning_rate': 7.483149097171902e-06} - -[Rank 2] Trainer log: {'loss': 0.2639, 'grad_norm': 10.152749061584473, 'learning_rate': 7.483149097171902e-06} -{'loss': 0.2639, 'grad_norm': 10.152749061584473, 'learning_rate': 7.483149097171902e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025415958371013405, 'train/lm_loss': 3.604564117267728e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.14214518666267395, 'train/uncertainty_loss': -7.107194978743792e-05, 'train/video_loss': 0.14413045346736908, 'train/total_loss': 0.144166499376297} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0350, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017266485374420882, 'train/lm_loss': 3.154028963763267e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.1570884734392166, 'train/uncertainty_loss': -7.087861304171384e-05, 'train/video_loss': 0.15842126309871674, 'train/total_loss': 0.1584528088569641} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2938, 'grad_norm': 4.525271892547607, 'learning_rate': 7.472826381020567e-06}[Rank 3] Trainer log: {'loss': 0.2938, 'grad_norm': 4.525271892547607, 'learning_rate': 7.472826381020567e-06}[Rank 0] Trainer log: {'loss': 0.2938, 'grad_norm': 4.525271892547607, 'learning_rate': 7.472826381020567e-06} - - -[Rank 2] Trainer log: {'loss': 0.2938, 'grad_norm': 4.525271892547607, 'learning_rate': 7.472826381020567e-06} -{'loss': 0.2938, 'grad_norm': 4.525271892547607, 'learning_rate': 7.472826381020567e-06, 'epoch': 0.6} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.39122741222381596, 'train/info_loss': 0.20494480431079865, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001311383326537907, 'train/video_loss': 0.2048136591911316, 'train/total_loss': 0.5960410833358765} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0196, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0500, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026153146754950287, 'train/lm_loss': 3.685611591208726e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.2518708109855652, 'train/uncertainty_loss': 0.00499938540160656, 'train/video_loss': 0.258986234664917, 'train/total_loss': 0.25902310013771057} -[Rank 1] Trainer log: {'loss': 0.4199, 'grad_norm': 4.228402614593506, 'learning_rate': 7.462506540705696e-06} -[Rank 0] Trainer log: {'loss': 0.4199, 'grad_norm': 4.228402614593506, 'learning_rate': 7.462506540705696e-06}[Rank 2] Trainer log: {'loss': 0.4199, 'grad_norm': 4.228402614593506, 'learning_rate': 7.462506540705696e-06} - -[Rank 3] Trainer log: {'loss': 0.4199, 'grad_norm': 4.228402614593506, 'learning_rate': 7.462506540705696e-06} -{'loss': 0.4199, 'grad_norm': 4.228402614593506, 'learning_rate': 7.462506540705696e-06, 'epoch': 0.6} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4133223056793213, 'train/info_loss': 0.2035372406244278, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001362652168609202, 'train/video_loss': 0.20340096950531006, 'train/total_loss': 0.6167232990264893} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(0.2098, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13510432243347167, 'train/info_loss': 0.10665728896856308, 'train/ref_loss': None, 'train/uncertainty_loss': -9.567201486788691e-05, 'train/video_loss': 0.10656161606311798, 'train/total_loss': 0.24166594445705414} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4006, 'grad_norm': 2.9256083965301514, 'learning_rate': 7.452189587970905e-06}[Rank 3] Trainer log: {'loss': 0.4006, 'grad_norm': 2.9256083965301514, 'learning_rate': 7.452189587970905e-06} - -[Rank 0] Trainer log: {'loss': 0.4006, 'grad_norm': 2.9256083965301514, 'learning_rate': 7.452189587970905e-06} -[Rank 1] Trainer log: {'loss': 0.4006, 'grad_norm': 2.9256083965301514, 'learning_rate': 7.452189587970905e-06} -{'loss': 0.4006, 'grad_norm': 2.9256083965301514, 'learning_rate': 7.452189587970905e-06, 'epoch': 0.6} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2413, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0432, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018908655038103462, 'train/lm_loss': 2.822676906362176e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.24993816018104553, 'train/uncertainty_loss': 0.004319602251052857, 'train/video_loss': 0.25579211115837097, 'train/total_loss': 0.2558203339576721} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.1210, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2084, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002989564556628466, 'train/lm_loss': 3.2374623697251084e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.3511437177658081, 'train/uncertainty_loss': 0.020839098095893863, 'train/video_loss': 0.3743986189365387, 'train/total_loss': 0.37443098425865173} -[Rank 3] Trainer log: {'loss': 0.3246, 'grad_norm': 8.11490249633789, 'learning_rate': 7.441875534556532e-06}[Rank 0] Trainer log: {'loss': 0.3246, 'grad_norm': 8.11490249633789, 'learning_rate': 7.441875534556532e-06} -[Rank 1] Trainer log: {'loss': 0.3246, 'grad_norm': 8.11490249633789, 'learning_rate': 7.441875534556532e-06}[Rank 2] Trainer log: {'loss': 0.3246, 'grad_norm': 8.11490249633789, 'learning_rate': 7.441875534556532e-06} - - -{'loss': 0.3246, 'grad_norm': 8.11490249633789, 'learning_rate': 7.441875534556532e-06, 'epoch': 0.6} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.2155, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000260551692917943, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.35253775119781494, 'train/uncertainty_loss': 0.0215474933385849, 'train/video_loss': 0.37619200348854065, 'train/total_loss': 0.3762240707874298} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09420356154441833, 'train/info_loss': 0.16639092564582825, 'train/ref_loss': None, 'train/uncertainty_loss': -8.704104693606496e-05, 'train/video_loss': 0.16630388796329498, 'train/total_loss': 0.2605074644088745} -tensor(0.2549, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1339, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4063, 'grad_norm': 6.015199661254883, 'learning_rate': 7.431564392199616e-06}[Rank 1] Trainer log: {'loss': 0.4063, 'grad_norm': 6.015199661254883, 'learning_rate': 7.431564392199616e-06}[Rank 0] Trainer log: {'loss': 0.4063, 'grad_norm': 6.015199661254883, 'learning_rate': 7.431564392199616e-06} - - -[Rank 2] Trainer log: {'loss': 0.4063, 'grad_norm': 6.015199661254883, 'learning_rate': 7.431564392199616e-06} -{'loss': 0.4063, 'grad_norm': 6.015199661254883, 'learning_rate': 7.431564392199616e-06, 'epoch': 0.6} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0667, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00035148102324455976, 'train/lm_loss': 4.119448713026941e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.2631396949291229, 'train/uncertainty_loss': 0.006666860729455948, 'train/video_loss': 0.27264219522476196, 'train/total_loss': 0.27268338203430176} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3284570455551148, 'train/info_loss': 0.22344471514225006, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012344663264229894, 'train/video_loss': 0.22332127392292023, 'train/total_loss': 0.5517783164978027} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3899, 'grad_norm': 3.793198347091675, 'learning_rate': 7.421256172633879e-06}[Rank 0] Trainer log: {'loss': 0.3899, 'grad_norm': 3.793198347091675, 'learning_rate': 7.421256172633879e-06}[Rank 1] Trainer log: {'loss': 0.3899, 'grad_norm': 3.793198347091675, 'learning_rate': 7.421256172633879e-06} - - -[Rank 3] Trainer log: {'loss': 0.3899, 'grad_norm': 3.793198347091675, 'learning_rate': 7.421256172633879e-06} -{'loss': 0.3899, 'grad_norm': 3.793198347091675, 'learning_rate': 7.421256172633879e-06, 'epoch': 0.6} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0409073680639267, 'train/info_loss': 0.1337866634130478, 'train/ref_loss': None, 'train/uncertainty_loss': -9.392414358444512e-05, 'train/video_loss': 0.13369274139404297, 'train/total_loss': 0.17460010945796967} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1074, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) {'train/tv_loss': 0.0002516054315492511, 'train/lm_loss': 4.7082128003239634e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.28860265016555786, 'train/uncertainty_loss': 0.010744983702898026, 'train/video_loss': 0.3013846278190613, 'train/total_loss': 0.30143171548843384} -tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3381, 'grad_norm': 8.358384132385254, 'learning_rate': 7.410950887589723e-06}[Rank 1] Trainer log: {'loss': 0.3381, 'grad_norm': 8.358384132385254, 'learning_rate': 7.410950887589723e-06} - -[Rank 2] Trainer log: {'loss': 0.3381, 'grad_norm': 8.358384132385254, 'learning_rate': 7.410950887589723e-06} -[Rank 3] Trainer log: {'loss': 0.3381, 'grad_norm': 8.358384132385254, 'learning_rate': 7.410950887589723e-06} -{'loss': 0.3381, 'grad_norm': 8.358384132385254, 'learning_rate': 7.410950887589723e-06, 'epoch': 0.6} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0678770124912262, 'train/info_loss': 0.15179839730262756, 'train/ref_loss': None, 'train/uncertainty_loss': -9.47940512560308e-05, 'train/video_loss': 0.1517035961151123, 'train/total_loss': 0.21958062052726746} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0018, device='cuda:0', grad_fn=) tensor(-0.0018, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.00016598963411524893, 'train/info_loss': 1.594387686054688e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001759967184625566, 'train/video_loss': -0.00016005284851416945, 'train/total_loss': 5.93678851146251e-06} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3499, 'grad_norm': 2.0738017559051514, 'learning_rate': 7.4006485487942e-06}[Rank 0] Trainer log: {'loss': 0.3499, 'grad_norm': 2.0738017559051514, 'learning_rate': 7.4006485487942e-06}[Rank 2] Trainer log: {'loss': 0.3499, 'grad_norm': 2.0738017559051514, 'learning_rate': 7.4006485487942e-06} - - -[Rank 3] Trainer log: {'loss': 0.3499, 'grad_norm': 2.0738017559051514, 'learning_rate': 7.4006485487942e-06} -{'loss': 0.3499, 'grad_norm': 2.0738017559051514, 'learning_rate': 7.4006485487942e-06, 'epoch': 0.6} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018143836641684177, 'train/lm_loss': 4.658156540244818e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.11871558427810669, 'train/uncertainty_loss': -7.071911823004484e-05, 'train/video_loss': 0.12012247741222382, 'train/total_loss': 0.12016905844211578} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2368436098098755, 'train/info_loss': 0.23286965489387512, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013321232981979846, 'train/video_loss': 0.23273643851280212, 'train/total_loss': 0.4695800542831421} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3687, 'grad_norm': 1.5719126462936401, 'learning_rate': 7.390349167971026e-06} -[Rank 2] Trainer log: {'loss': 0.3687, 'grad_norm': 1.5719126462936401, 'learning_rate': 7.390349167971026e-06}[Rank 1] Trainer log: {'loss': 0.3687, 'grad_norm': 1.5719126462936401, 'learning_rate': 7.390349167971026e-06} - -[Rank 0] Trainer log: {'loss': 0.3687, 'grad_norm': 1.5719126462936401, 'learning_rate': 7.390349167971026e-06} -{'loss': 0.3687, 'grad_norm': 1.5719126462936401, 'learning_rate': 7.390349167971026e-06, 'epoch': 0.6} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1881, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(2.0709, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00030494665261358026, 'train/lm_loss': 4.190959443803877e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 1.5152612924575806, 'train/uncertainty_loss': 0.20708768367767336, 'train/video_loss': 1.7248142957687378, 'train/total_loss': 1.7248562574386597} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2040, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018794705392792823, 'train/lm_loss': 7.861499325372279e-05, 'train/info_loss': 3.075505082961172e-05, 'train/ref_loss': 0.33991751074790955, 'train/uncertainty_loss': 0.02040142118930817, 'train/video_loss': 0.3618532717227936, 'train/total_loss': 0.3619318902492523} -[Rank 1] Trainer log: {'loss': 0.5213, 'grad_norm': 15.203214645385742, 'learning_rate': 7.380052756840532e-06} -[Rank 2] Trainer log: {'loss': 0.5213, 'grad_norm': 15.203214645385742, 'learning_rate': 7.380052756840532e-06} -[Rank 3] Trainer log: {'loss': 0.5213, 'grad_norm': 15.203214645385742, 'learning_rate': 7.380052756840532e-06}[Rank 0] Trainer log: {'loss': 0.5213, 'grad_norm': 15.203214645385742, 'learning_rate': 7.380052756840532e-06} - -{'loss': 0.5213, 'grad_norm': 15.203214645385742, 'learning_rate': 7.380052756840532e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1675, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.3541, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020236645359545947, 'train/lm_loss': 3.66415799362585e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.4295141100883484, 'train/uncertainty_loss': 0.035407188534736636, 'train/video_loss': 0.4665663242340088, 'train/total_loss': 0.46660295128822327} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.060422807931900024, 'train/info_loss': 0.21386149525642395, 'train/ref_loss': None, 'train/uncertainty_loss': -8.4633071674034e-05, 'train/video_loss': 0.2137768566608429, 'train/total_loss': 0.2741996645927429} -tensor(0.0540, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3389, 'grad_norm': 6.235914707183838, 'learning_rate': 7.369759327119689e-06}[Rank 3] Trainer log: {'loss': 0.3389, 'grad_norm': 6.235914707183838, 'learning_rate': 7.369759327119689e-06} -[Rank 2] Trainer log: {'loss': 0.3389, 'grad_norm': 6.235914707183838, 'learning_rate': 7.369759327119689e-06} - -[Rank 0] Trainer log: {'loss': 0.3389, 'grad_norm': 6.235914707183838, 'learning_rate': 7.369759327119689e-06} -{'loss': 0.3389, 'grad_norm': 6.235914707183838, 'learning_rate': 7.369759327119689e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27507691383361815, 'train/info_loss': 0.11027085781097412, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010852905688807368, 'train/video_loss': 0.11016232520341873, 'train/total_loss': 0.38523924350738525} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35483894348144535, 'train/info_loss': 0.16911433637142181, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010970082366839052, 'train/video_loss': 0.1690046340227127, 'train/total_loss': 0.5238435864448547} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.5214, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4003, 'grad_norm': 4.810464382171631, 'learning_rate': 7.359468890522053e-06}[Rank 0] Trainer log: {'loss': 0.4003, 'grad_norm': 4.810464382171631, 'learning_rate': 7.359468890522053e-06} -[Rank 2] Trainer log: {'loss': 0.4003, 'grad_norm': 4.810464382171631, 'learning_rate': 7.359468890522053e-06} - -[Rank 3] Trainer log: {'loss': 0.4003, 'grad_norm': 4.810464382171631, 'learning_rate': 7.359468890522053e-06} -{'loss': 0.4003, 'grad_norm': 4.810464382171631, 'learning_rate': 7.359468890522053e-06, 'epoch': 0.61} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018432595534250142, 'train/lm_loss': 4.1718900320120156e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.11716257035732269, 'train/uncertainty_loss': -6.981982733123004e-05, 'train/video_loss': 0.11859429627656937, 'train/total_loss': 0.11863601207733154} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5135190963745118, 'train/info_loss': 0.1432260274887085, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012278165668249132, 'train/video_loss': 0.1431032419204712, 'train/total_loss': 0.6566223502159119} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3883, 'grad_norm': 1.8167771100997925, 'learning_rate': 7.349181458757795e-06}[Rank 3] Trainer log: {'loss': 0.3883, 'grad_norm': 1.8167771100997925, 'learning_rate': 7.349181458757795e-06} -[Rank 1] Trainer log: {'loss': 0.3883, 'grad_norm': 1.8167771100997925, 'learning_rate': 7.349181458757795e-06} - -[Rank 2] Trainer log: {'loss': 0.3883, 'grad_norm': 1.8167771100997925, 'learning_rate': 7.349181458757795e-06} -{'loss': 0.3883, 'grad_norm': 1.8167771100997925, 'learning_rate': 7.349181458757795e-06, 'epoch': 0.61} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0089, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0008919524028897286, 'train/lm_loss': 4.1361345211043954e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.2176373451948166, 'train/uncertainty_loss': 0.0008864547125995159, 'train/video_loss': 0.22568432986736298, 'train/total_loss': 0.2257256954908371} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.1423, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020396471954882146, 'train/lm_loss': 4.1766572394408286e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.2074023187160492, 'train/uncertainty_loss': -7.140139932744205e-05, 'train/video_loss': 0.2089875489473343, 'train/total_loss': 0.20902931690216064} -[Rank 1] Trainer log: {'loss': 0.2609, 'grad_norm': 4.321936130523682, 'learning_rate': 7.338897043533657e-06} -[Rank 0] Trainer log: {'loss': 0.2609, 'grad_norm': 4.321936130523682, 'learning_rate': 7.338897043533657e-06}[Rank 2] Trainer log: {'loss': 0.2609, 'grad_norm': 4.321936130523682, 'learning_rate': 7.338897043533657e-06} -[Rank 3] Trainer log: {'loss': 0.2609, 'grad_norm': 4.321936130523682, 'learning_rate': 7.338897043533657e-06} - -{'loss': 0.2609, 'grad_norm': 4.321936130523682, 'learning_rate': 7.338897043533657e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1577874302864075, 'train/info_loss': 0.21228693425655365, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011665247147902847, 'train/video_loss': 0.2121702879667282, 'train/total_loss': 0.36995771527290344}tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) - -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0731, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023307413794100286, 'train/lm_loss': 3.66415799362585e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.2575944662094116, 'train/uncertainty_loss': 0.007307835668325425, 'train/video_loss': 0.26679179072380066, 'train/total_loss': 0.26682841777801514} -[Rank 1] Trainer log: {'loss': 0.4045, 'grad_norm': 2.4570934772491455, 'learning_rate': 7.328615656552946e-06}[Rank 0] Trainer log: {'loss': 0.4045, 'grad_norm': 2.4570934772491455, 'learning_rate': 7.328615656552946e-06}[Rank 2] Trainer log: {'loss': 0.4045, 'grad_norm': 2.4570934772491455, 'learning_rate': 7.328615656552946e-06} - - -{'loss': 0.4045, 'grad_norm': 2.4570934772491455, 'learning_rate': 7.328615656552946e-06, 'epoch': 0.61} -[Rank 3] Trainer log: {'loss': 0.4045, 'grad_norm': 2.4570934772491455, 'learning_rate': 7.328615656552946e-06} -tensor(-0.0016, device='cuda:1', grad_fn=) tensor(-0.0016, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3085006952285767, 'train/info_loss': 0.2794577181339264, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001397313317283988, 'train/video_loss': 0.2793179750442505, 'train/total_loss': 0.5878186821937561} -tensor(-0.0017, device='cuda:3', grad_fn=) tensor(-0.0017, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2566, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021240522619336843, 'train/lm_loss': 3.2326945802196864e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.38862547278404236, 'train/uncertainty_loss': 0.02565765380859375, 'train/video_loss': 0.4160061478614807, 'train/total_loss': 0.41603848338127136} -[Rank 1] Trainer log: {'loss': 0.467, 'grad_norm': 7.8897576332092285, 'learning_rate': 7.31833730951553e-06}[Rank 2] Trainer log: {'loss': 0.467, 'grad_norm': 7.8897576332092285, 'learning_rate': 7.31833730951553e-06} - -[Rank 3] Trainer log: {'loss': 0.467, 'grad_norm': 7.8897576332092285, 'learning_rate': 7.31833730951553e-06} -[Rank 0] Trainer log: {'loss': 0.467, 'grad_norm': 7.8897576332092285, 'learning_rate': 7.31833730951553e-06} -{'loss': 0.467, 'grad_norm': 7.8897576332092285, 'learning_rate': 7.31833730951553e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16088353395462038, 'train/info_loss': 0.1377648413181305, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011590631911531092, 'train/video_loss': 0.13764894008636475, 'train/total_loss': 0.29853248596191406} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0712, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2209, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1611046552658081, 'train/info_loss': 0.1764184683561325, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001022066455334425, 'train/video_loss': 0.1763162612915039, 'train/total_loss': 0.3374209403991699} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3113, 'grad_norm': 14.87103271484375, 'learning_rate': 7.308062014117807e-06}[Rank 3] Trainer log: {'loss': 0.3113, 'grad_norm': 14.87103271484375, 'learning_rate': 7.308062014117807e-06}[Rank 0] Trainer log: {'loss': 0.3113, 'grad_norm': 14.87103271484375, 'learning_rate': 7.308062014117807e-06} - -[Rank 1] Trainer log: {'loss': 0.3113, 'grad_norm': 14.87103271484375, 'learning_rate': 7.308062014117807e-06} - -{'loss': 0.3113, 'grad_norm': 14.87103271484375, 'learning_rate': 7.308062014117807e-06, 'epoch': 0.61} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0016, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1546, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024657673202455044, 'train/lm_loss': 4.1480531217530375e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.31095895171165466, 'train/uncertainty_loss': 0.015462432801723481, 'train/video_loss': 0.32842010259628296, 'train/total_loss': 0.32846158742904663} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0159, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026678843423724177, 'train/lm_loss': 7.832900737412274e-05, 'train/info_loss': 3.00994397548493e-05, 'train/ref_loss': 0.18431296944618225, 'train/uncertainty_loss': -6.810157210566104e-05, 'train/video_loss': 0.18640927970409393, 'train/total_loss': 0.18648761510849} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.1873, 'grad_norm': 8.480284690856934, 'learning_rate': 7.297789782052716e-06}[Rank 2] Trainer log: {'loss': 0.1873, 'grad_norm': 8.480284690856934, 'learning_rate': 7.297789782052716e-06}[Rank 0] Trainer log: {'loss': 0.1873, 'grad_norm': 8.480284690856934, 'learning_rate': 7.297789782052716e-06} - -[Rank 1] Trainer log: {'loss': 0.1873, 'grad_norm': 8.480284690856934, 'learning_rate': 7.297789782052716e-06} - -{'loss': 0.1873, 'grad_norm': 8.480284690856934, 'learning_rate': 7.297789782052716e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.278236985206604, 'train/info_loss': 0.2693142294883728, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011899105738848448, 'train/video_loss': 0.26919522881507874, 'train/total_loss': 0.5474321842193604} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0671, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04149869978427887, 'train/info_loss': 0.14352403581142426, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010661608539521695, 'train/video_loss': 0.14341741800308228, 'train/total_loss': 0.18491612374782562} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1014, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2919, 'grad_norm': 9.055035591125488, 'learning_rate': 7.287520625009698e-06}[Rank 3] Trainer log: {'loss': 0.2919, 'grad_norm': 9.055035591125488, 'learning_rate': 7.287520625009698e-06}[Rank 2] Trainer log: {'loss': 0.2919, 'grad_norm': 9.055035591125488, 'learning_rate': 7.287520625009698e-06} - - -[Rank 0] Trainer log: {'loss': 0.2919, 'grad_norm': 9.055035591125488, 'learning_rate': 7.287520625009698e-06} -{'loss': 0.2919, 'grad_norm': 9.055035591125488, 'learning_rate': 7.287520625009698e-06, 'epoch': 0.61} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.056796944141387945, 'train/info_loss': 0.12968945503234863, 'train/ref_loss': None, 'train/uncertainty_loss': -8.451696485280991e-05, 'train/video_loss': 0.12960493564605713, 'train/total_loss': 0.1864018738269806} -tensor(0.2347, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4982144355773926, 'train/info_loss': 0.1737658530473709, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010206189472228289, 'train/video_loss': 0.17366379499435425, 'train/total_loss': 0.6718782186508179} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.422, 'grad_norm': 3.554595708847046, 'learning_rate': 7.2772545546747e-06}[Rank 0] Trainer log: {'loss': 0.422, 'grad_norm': 3.554595708847046, 'learning_rate': 7.2772545546747e-06}[Rank 3] Trainer log: {'loss': 0.422, 'grad_norm': 3.554595708847046, 'learning_rate': 7.2772545546747e-06} - -[Rank 1] Trainer log: {'loss': 0.422, 'grad_norm': 3.554595708847046, 'learning_rate': 7.2772545546747e-06} - -{'loss': 0.422, 'grad_norm': 3.554595708847046, 'learning_rate': 7.2772545546747e-06, 'epoch': 0.61} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34879364967346194, 'train/info_loss': 0.2005254179239273, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013787227217108012, 'train/video_loss': 0.20038755238056183, 'train/total_loss': 0.5491812229156494} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29176979064941405, 'train/info_loss': 0.10909203439950943, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012095099082216622, 'train/video_loss': 0.10897108167409897, 'train/total_loss': 0.4007408916950226} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0458, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2777, 'grad_norm': 3.0205113887786865, 'learning_rate': 7.266991582730163e-06}[Rank 3] Trainer log: {'loss': 0.2777, 'grad_norm': 3.0205113887786865, 'learning_rate': 7.266991582730163e-06}[Rank 1] Trainer log: {'loss': 0.2777, 'grad_norm': 3.0205113887786865, 'learning_rate': 7.266991582730163e-06} - - -[Rank 0] Trainer log: {'loss': 0.2777, 'grad_norm': 3.0205113887786865, 'learning_rate': 7.266991582730163e-06} -{'loss': 0.2777, 'grad_norm': 3.0205113887786865, 'learning_rate': 7.266991582730163e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08765064477920533, 'train/info_loss': 0.1949857473373413, 'train/ref_loss': None, 'train/uncertainty_loss': -8.698312449268998e-05, 'train/video_loss': 0.19489876925945282, 'train/total_loss': 0.2825494110584259} -tensor(0.0282, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0351, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020341938361525538, 'train/lm_loss': 3.189786220900714e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.23324939608573914, 'train/uncertainty_loss': 0.0035092510282993318, 'train/video_loss': 0.23840734362602234, 'train/total_loss': 0.23843924701213837} -[Rank 1] Trainer log: {'loss': 0.3582, 'grad_norm': 5.407779216766357, 'learning_rate': 7.256731720854983e-06}[Rank 2] Trainer log: {'loss': 0.3582, 'grad_norm': 5.407779216766357, 'learning_rate': 7.256731720854983e-06} - -[Rank 0] Trainer log: {'loss': 0.3582, 'grad_norm': 5.407779216766357, 'learning_rate': 7.256731720854983e-06}[Rank 3] Trainer log: {'loss': 0.3582, 'grad_norm': 5.407779216766357, 'learning_rate': 7.256731720854983e-06} - -{'loss': 0.3582, 'grad_norm': 5.407779216766357, 'learning_rate': 7.256731720854983e-06, 'epoch': 0.61} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.9262, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020331218838691713, 'train/lm_loss': 3.6617740988731384e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.8567694425582886, 'train/uncertainty_loss': 0.09262324571609498, 'train/video_loss': 0.9510415196418762, 'train/total_loss': 0.9510781168937683} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1914445996284485, 'train/info_loss': 0.13093195855617523, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010999958030879498, 'train/video_loss': 0.13082195818424225, 'train/total_loss': 0.3222665786743164} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0594, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1025, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4532, 'grad_norm': 10.337276458740234, 'learning_rate': 7.246474980724541e-06}[Rank 0] Trainer log: {'loss': 0.4532, 'grad_norm': 10.337276458740234, 'learning_rate': 7.246474980724541e-06}[Rank 1] Trainer log: {'loss': 0.4532, 'grad_norm': 10.337276458740234, 'learning_rate': 7.246474980724541e-06}[Rank 3] Trainer log: {'loss': 0.4532, 'grad_norm': 10.337276458740234, 'learning_rate': 7.246474980724541e-06} - - - -{'loss': 0.4532, 'grad_norm': 10.337276458740234, 'learning_rate': 7.246474980724541e-06, 'epoch': 0.61} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4020824432373047, 'train/info_loss': 0.15719595551490784, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011056320508942009, 'train/video_loss': 0.1570853888988495, 'train/total_loss': 0.5591678619384766} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2599753141403198, 'train/info_loss': 0.32197311520576477, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013629933819174768, 'train/video_loss': 0.32183682918548584, 'train/total_loss': 0.5818121433258057} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4479, 'grad_norm': 2.697141408920288, 'learning_rate': 7.236221374010648e-06}[Rank 1] Trainer log: {'loss': 0.4479, 'grad_norm': 2.697141408920288, 'learning_rate': 7.236221374010648e-06} - -[Rank 2] Trainer log: {'loss': 0.4479, 'grad_norm': 2.697141408920288, 'learning_rate': 7.236221374010648e-06} -[Rank 0] Trainer log: {'loss': 0.4479, 'grad_norm': 2.697141408920288, 'learning_rate': 7.236221374010648e-06} -{'loss': 0.4479, 'grad_norm': 2.697141408920288, 'learning_rate': 7.236221374010648e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33686506748199463, 'train/info_loss': 0.16023565828800201, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001010504667647183, 'train/video_loss': 0.16013461351394653, 'train/total_loss': 0.49699968099594116} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0535, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(0.1133, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002482716692611575, 'train/lm_loss': 3.628401609603316e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.2751573622226715, 'train/uncertainty_loss': 0.011334220319986344, 'train/video_loss': 0.2884990870952606, 'train/total_loss': 0.28853535652160645} -[Rank 2] Trainer log: {'loss': 0.4172, 'grad_norm': 3.264735698699951, 'learning_rate': 7.225970912381557e-06}[Rank 0] Trainer log: {'loss': 0.4172, 'grad_norm': 3.264735698699951, 'learning_rate': 7.225970912381557e-06}[Rank 3] Trainer log: {'loss': 0.4172, 'grad_norm': 3.264735698699951, 'learning_rate': 7.225970912381557e-06}[Rank 1] Trainer log: {'loss': 0.4172, 'grad_norm': 3.264735698699951, 'learning_rate': 7.225970912381557e-06} - - - -{'loss': 0.4172, 'grad_norm': 3.264735698699951, 'learning_rate': 7.225970912381557e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04162447452545166, 'train/info_loss': 0.17071090638637543, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010696225799620152, 'train/video_loss': 0.170603945851326, 'train/total_loss': 0.2122284173965454} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2720, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002304486930370331, 'train/lm_loss': 3.6879954859614375e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.40188857913017273, 'train/uncertainty_loss': 0.027198073267936707, 'train/video_loss': 0.4309540390968323, 'train/total_loss': 0.43099090456962585} -tensor(0.8110, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4436, 'grad_norm': 12.123979568481445, 'learning_rate': 7.215723607501936e-06}[Rank 1] Trainer log: {'loss': 0.4436, 'grad_norm': 12.123979568481445, 'learning_rate': 7.215723607501936e-06} - -[Rank 2] Trainer log: {'loss': 0.4436, 'grad_norm': 12.123979568481445, 'learning_rate': 7.215723607501936e-06} -[Rank 3] Trainer log: {'loss': 0.4436, 'grad_norm': 12.123979568481445, 'learning_rate': 7.215723607501936e-06} -{'loss': 0.4436, 'grad_norm': 12.123979568481445, 'learning_rate': 7.215723607501936e-06, 'epoch': 0.61} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3010401010513306, 'train/info_loss': 0.25605079531669617, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013697628164663911, 'train/video_loss': 0.2559138238430023, 'train/total_loss': 0.5569539070129395} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2268, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002595903351902962, 'train/lm_loss': 2.7916868566535415e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.366540789604187, 'train/uncertainty_loss': 0.022680968046188354, 'train/video_loss': 0.3913194537162781, 'train/total_loss': 0.39134737849235535} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3306, 'grad_norm': 2.3139994144439697, 'learning_rate': 7.205479471032868e-06}[Rank 2] Trainer log: {'loss': 0.3306, 'grad_norm': 2.3139994144439697, 'learning_rate': 7.205479471032868e-06}[Rank 3] Trainer log: {'loss': 0.3306, 'grad_norm': 2.3139994144439697, 'learning_rate': 7.205479471032868e-06} - - -[Rank 0] Trainer log: {'loss': 0.3306, 'grad_norm': 2.3139994144439697, 'learning_rate': 7.205479471032868e-06} -{'loss': 0.3306, 'grad_norm': 2.3139994144439697, 'learning_rate': 7.205479471032868e-06, 'epoch': 0.61} -tensor(0.1290, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1482, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027003423310816287, 'train/lm_loss': 4.1766572394408286e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.19728365540504456, 'train/uncertainty_loss': -6.83060206938535e-05, 'train/video_loss': 0.1994025707244873, 'train/total_loss': 0.19944433867931366} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3207614660263062, 'train/info_loss': 0.20369303226470947, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010888667311519385, 'train/video_loss': 0.20358414947986603, 'train/total_loss': 0.5243456363677979} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0489, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3541, 'grad_norm': 6.704573631286621, 'learning_rate': 7.195238514631828e-06} -[Rank 0] Trainer log: {'loss': 0.3541, 'grad_norm': 6.704573631286621, 'learning_rate': 7.195238514631828e-06}[Rank 3] Trainer log: {'loss': 0.3541, 'grad_norm': 6.704573631286621, 'learning_rate': 7.195238514631828e-06}[Rank 2] Trainer log: {'loss': 0.3541, 'grad_norm': 6.704573631286621, 'learning_rate': 7.195238514631828e-06} - - -{'loss': 0.3541, 'grad_norm': 6.704573631286621, 'learning_rate': 7.195238514631828e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09019522666931153, 'train/info_loss': 0.20173078775405884, 'train/ref_loss': None, 'train/uncertainty_loss': -9.448174969293178e-05, 'train/video_loss': 0.20163629949092865, 'train/total_loss': 0.29183152318000793} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.5623, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024150246754288675, 'train/lm_loss': 4.105146508663893e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.6435815691947937, 'train/uncertainty_loss': 0.056234520673751835, 'train/video_loss': 0.7017726898193359, 'train/total_loss': 0.7018137574195862} -tensor(0.3200, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1459, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4438, 'grad_norm': 6.091743469238281, 'learning_rate': 7.185000749952667e-06}[Rank 1] Trainer log: {'loss': 0.4438, 'grad_norm': 6.091743469238281, 'learning_rate': 7.185000749952667e-06}[Rank 3] Trainer log: {'loss': 0.4438, 'grad_norm': 6.091743469238281, 'learning_rate': 7.185000749952667e-06} - - -[Rank 0] Trainer log: {'loss': 0.4438, 'grad_norm': 6.091743469238281, 'learning_rate': 7.185000749952667e-06} -{'loss': 0.4438, 'grad_norm': 6.091743469238281, 'learning_rate': 7.185000749952667e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07533609867095947, 'train/info_loss': 0.1740577667951584, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011698298621922732, 'train/video_loss': 0.1739407777786255, 'train/total_loss': 0.24927687644958496} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022260025143623353, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.17132730782032013, 'train/uncertainty_loss': -7.104640826582909e-05, 'train/video_loss': 0.17305709421634674, 'train/total_loss': 0.17308498919010162} -[Rank 3] Trainer log: {'loss': 0.2255, 'grad_norm': 3.1734390258789062, 'learning_rate': 7.1747661886456145e-06}[Rank 1] Trainer log: {'loss': 0.2255, 'grad_norm': 3.1734390258789062, 'learning_rate': 7.1747661886456145e-06}[Rank 2] Trainer log: {'loss': 0.2255, 'grad_norm': 3.1734390258789062, 'learning_rate': 7.1747661886456145e-06} - - -[Rank 0] Trainer log: {'loss': 0.2255, 'grad_norm': 3.1734390258789062, 'learning_rate': 7.1747661886456145e-06} -{'loss': 0.2255, 'grad_norm': 3.1734390258789062, 'learning_rate': 7.1747661886456145e-06, 'epoch': 0.61} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0689, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002288286341354251, 'train/lm_loss': 3.235078474972397e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.12724560499191284, 'train/uncertainty_loss': -6.960881291888655e-05, 'train/video_loss': 0.1290307641029358, 'train/total_loss': 0.12906311452388763} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0808650016784668, 'train/info_loss': 0.21331509947776794, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011761366622522474, 'train/video_loss': 0.2131974846124649, 'train/total_loss': 0.2940624952316284} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3372, 'grad_norm': 6.673017501831055, 'learning_rate': 7.164534842357242e-06}[Rank 2] Trainer log: {'loss': 0.3372, 'grad_norm': 6.673017501831055, 'learning_rate': 7.164534842357242e-06} - -[Rank 0] Trainer log: {'loss': 0.3372, 'grad_norm': 6.673017501831055, 'learning_rate': 7.164534842357242e-06}[Rank 3] Trainer log: {'loss': 0.3372, 'grad_norm': 6.673017501831055, 'learning_rate': 7.164534842357242e-06} - -{'loss': 0.3372, 'grad_norm': 6.673017501831055, 'learning_rate': 7.164534842357242e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2221294641494751, 'train/info_loss': 0.20450808107852936, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011807627743110061, 'train/video_loss': 0.2043900042772293, 'train/total_loss': 0.4265194535255432} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11507636308670044, 'train/info_loss': 0.11638493835926056, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010106551926583052, 'train/video_loss': 0.11628387123346329, 'train/total_loss': 0.23136022686958313} -tensor(0.2311, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0847, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4399, 'grad_norm': 7.199936866760254, 'learning_rate': 7.154306722730478e-06}[Rank 2] Trainer log: {'loss': 0.4399, 'grad_norm': 7.199936866760254, 'learning_rate': 7.154306722730478e-06}[Rank 1] Trainer log: {'loss': 0.4399, 'grad_norm': 7.199936866760254, 'learning_rate': 7.154306722730478e-06} - - -[Rank 0] Trainer log: {'loss': 0.4399, 'grad_norm': 7.199936866760254, 'learning_rate': 7.154306722730478e-06} -{'loss': 0.4399, 'grad_norm': 7.199936866760254, 'learning_rate': 7.154306722730478e-06, 'epoch': 0.61} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32857804298400883, 'train/info_loss': 0.12067072093486786, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001347301993519068, 'train/video_loss': 0.12053599208593369, 'train/total_loss': 0.449114054441452} -tensor(0.2330, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0754, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04577330946922303, 'train/info_loss': 0.2549552321434021, 'train/ref_loss': None, 'train/uncertainty_loss': -8.895613136701287e-05, 'train/video_loss': 0.25486627221107483, 'train/total_loss': 0.3006395697593689} -tensor(0.1011, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3002, 'grad_norm': 2.4748001098632812, 'learning_rate': 7.144081841404555e-06}[Rank 1] Trainer log: {'loss': 0.3002, 'grad_norm': 2.4748001098632812, 'learning_rate': 7.144081841404555e-06}[Rank 2] Trainer log: {'loss': 0.3002, 'grad_norm': 2.4748001098632812, 'learning_rate': 7.144081841404555e-06} - - -[Rank 0] Trainer log: {'loss': 0.3002, 'grad_norm': 2.4748001098632812, 'learning_rate': 7.144081841404555e-06} -{'loss': 0.3002, 'grad_norm': 2.4748001098632812, 'learning_rate': 7.144081841404555e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001518888515420258, 'train/lm_loss': 4.0884607005864386e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.19310010969638824, 'train/uncertainty_loss': -6.68015331029892e-05, 'train/video_loss': 0.1942741721868515, 'train/total_loss': 0.19431506097316742} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1937, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016871318221092226, 'train/lm_loss': 2.46032839640975e-05, 'train/info_loss': 1.8536700736149214e-05, 'train/ref_loss': 0.3425246477127075, 'train/uncertainty_loss': 0.019366936385631563, 'train/video_loss': 0.36325985193252563, 'train/total_loss': 0.36328446865081787} -[Rank 2] Trainer log: {'loss': 0.3477, 'grad_norm': 2.5463738441467285, 'learning_rate': 7.133860210015049e-06}[Rank 3] Trainer log: {'loss': 0.3477, 'grad_norm': 2.5463738441467285, 'learning_rate': 7.133860210015049e-06} - -[Rank 0] Trainer log: {'loss': 0.3477, 'grad_norm': 2.5463738441467285, 'learning_rate': 7.133860210015049e-06}[Rank 1] Trainer log: {'loss': 0.3477, 'grad_norm': 2.5463738441467285, 'learning_rate': 7.133860210015049e-06} - -{'loss': 0.3477, 'grad_norm': 2.5463738441467285, 'learning_rate': 7.133860210015049e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.4249, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2576, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002067379653453827, 'train/lm_loss': 0.0001150753814727068, 'train/info_loss': 3.2006668334361166e-05, 'train/ref_loss': 0.3875848948955536, 'train/uncertainty_loss': 0.025762787461280825, 'train/video_loss': 0.41503360867500305, 'train/total_loss': 0.41514867544174194} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1396630048751831, 'train/info_loss': 0.1982755959033966, 'train/ref_loss': None, 'train/uncertainty_loss': -8.177573909051716e-05, 'train/video_loss': 0.19819381833076477, 'train/total_loss': 0.33785682916641235} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3997, 'grad_norm': 5.599403381347656, 'learning_rate': 7.123641840193822e-06} -[Rank 3] Trainer log: {'loss': 0.3997, 'grad_norm': 5.599403381347656, 'learning_rate': 7.123641840193822e-06} -[Rank 0] Trainer log: {'loss': 0.3997, 'grad_norm': 5.599403381347656, 'learning_rate': 7.123641840193822e-06}[Rank 2] Trainer log: {'loss': 0.3997, 'grad_norm': 5.599403381347656, 'learning_rate': 7.123641840193822e-06} - -{'loss': 0.3997, 'grad_norm': 5.599403381347656, 'learning_rate': 7.123641840193822e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019413747359067204, 'train/lm_loss': 2.822676906362176e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.2031768262386322, 'train/uncertainty_loss': -6.708538858219981e-05, 'train/video_loss': 0.20468519628047943, 'train/total_loss': 0.20471341907978058} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3682896614074707, 'train/info_loss': 0.18811143934726715, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012148009845986963, 'train/video_loss': 0.1879899650812149, 'train/total_loss': 0.5562796592712402} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3749, 'grad_norm': 5.461950302124023, 'learning_rate': 7.113426743569021e-06} -[Rank 3] Trainer log: {'loss': 0.3749, 'grad_norm': 5.461950302124023, 'learning_rate': 7.113426743569021e-06}[Rank 0] Trainer log: {'loss': 0.3749, 'grad_norm': 5.461950302124023, 'learning_rate': 7.113426743569021e-06} - -[Rank 2] Trainer log: {'loss': 0.3749, 'grad_norm': 5.461950302124023, 'learning_rate': 7.113426743569021e-06} -{'loss': 0.3749, 'grad_norm': 5.461950302124023, 'learning_rate': 7.113426743569021e-06, 'epoch': 0.61} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27834692001342776, 'train/info_loss': 0.08065059036016464, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011908147716894747, 'train/video_loss': 0.08053150773048401, 'train/total_loss': 0.3588784337043762} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30547256469726564, 'train/info_loss': 0.2032054364681244, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012029089266434312, 'train/video_loss': 0.20308513939380646, 'train/total_loss': 0.5085577368736267} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0856, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3051, 'grad_norm': 5.296295166015625, 'learning_rate': 7.10321493176508e-06} -[Rank 0] Trainer log: {'loss': 0.3051, 'grad_norm': 5.296295166015625, 'learning_rate': 7.10321493176508e-06}[Rank 3] Trainer log: {'loss': 0.3051, 'grad_norm': 5.296295166015625, 'learning_rate': 7.10321493176508e-06} - -[Rank 2] Trainer log: {'loss': 0.3051, 'grad_norm': 5.296295166015625, 'learning_rate': 7.10321493176508e-06} -{'loss': 0.3051, 'grad_norm': 5.296295166015625, 'learning_rate': 7.10321493176508e-06, 'epoch': 0.61} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12030736207962037, 'train/info_loss': 0.2631644308567047, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013222689740359785, 'train/video_loss': 0.2630321979522705, 'train/total_loss': 0.3833395540714264} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0196, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0236, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002046938054263592, 'train/lm_loss': 5.2659731591120365e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.21811693906784058, 'train/uncertainty_loss': 0.0023552108556032184, 'train/video_loss': 0.22213312983512878, 'train/total_loss': 0.22218579053878784} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(0.1242, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3101, 'grad_norm': 6.107998371124268, 'learning_rate': 7.093006416402684e-06}[Rank 3] Trainer log: {'loss': 0.3101, 'grad_norm': 6.107998371124268, 'learning_rate': 7.093006416402684e-06}[Rank 0] Trainer log: {'loss': 0.3101, 'grad_norm': 6.107998371124268, 'learning_rate': 7.093006416402684e-06} - - -[Rank 1] Trainer log: {'loss': 0.3101, 'grad_norm': 6.107998371124268, 'learning_rate': 7.093006416402684e-06} -{'loss': 0.3101, 'grad_norm': 6.107998371124268, 'learning_rate': 7.093006416402684e-06, 'epoch': 0.61} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000171824567951262, 'train/lm_loss': 3.239845973439515e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.11696814000606537, 'train/uncertainty_loss': -6.702624377794564e-05, 'train/video_loss': 0.11829770356416702, 'train/total_loss': 0.11833009868860245} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1495, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00036078577395528557, 'train/lm_loss': 7.094078464433552e-05, 'train/info_loss': 3.0516646802425385e-05, 'train/ref_loss': 0.31465011835098267, 'train/uncertainty_loss': 0.014952132105827333, 'train/video_loss': 0.3325190544128418, 'train/total_loss': 0.3325899839401245} -[Rank 3] Trainer log: {'loss': 0.341, 'grad_norm': 5.006908893585205, 'learning_rate': 7.082801209098775e-06}[Rank 2] Trainer log: {'loss': 0.341, 'grad_norm': 5.006908893585205, 'learning_rate': 7.082801209098775e-06}[Rank 0] Trainer log: {'loss': 0.341, 'grad_norm': 5.006908893585205, 'learning_rate': 7.082801209098775e-06} - -[Rank 1] Trainer log: {'loss': 0.341, 'grad_norm': 5.006908893585205, 'learning_rate': 7.082801209098775e-06} - -{'loss': 0.341, 'grad_norm': 5.006908893585205, 'learning_rate': 7.082801209098775e-06, 'epoch': 0.61} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026292209513485433, 'train/lm_loss': 0.00011888784356415272, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.13308259844779968, 'train/uncertainty_loss': -6.897833663970232e-05, 'train/video_loss': 0.13514524698257446, 'train/total_loss': 0.13526412844657898} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2312880039215088, 'train/info_loss': 0.24638144671916962, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013672570930793883, 'train/video_loss': 0.24624471366405487, 'train/total_loss': 0.4775327146053314} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1340, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3799, 'grad_norm': 5.872650146484375, 'learning_rate': 7.072599321466523e-06}[Rank 3] Trainer log: {'loss': 0.3799, 'grad_norm': 5.872650146484375, 'learning_rate': 7.072599321466523e-06}[Rank 2] Trainer log: {'loss': 0.3799, 'grad_norm': 5.872650146484375, 'learning_rate': 7.072599321466523e-06} - - -[Rank 0] Trainer log: {'loss': 0.3799, 'grad_norm': 5.872650146484375, 'learning_rate': 7.072599321466523e-06} -{'loss': 0.3799, 'grad_norm': 5.872650146484375, 'learning_rate': 7.072599321466523e-06, 'epoch': 0.61} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3173127889633179, 'train/info_loss': 0.19134551286697388, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012018078705295921, 'train/video_loss': 0.1912253350019455, 'train/total_loss': 0.5085381269454956} -tensor(0.0361, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1568, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003343466203659773, 'train/lm_loss': 4.183808341622353e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.1500389724969864, 'train/uncertainty_loss': -7.046374958008528e-05, 'train/video_loss': 0.152667835354805, 'train/total_loss': 0.15270967781543732} -tensor(0.2488, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.366, 'grad_norm': 1.8795405626296997, 'learning_rate': 7.062400765115326e-06}[Rank 1] Trainer log: {'loss': 0.366, 'grad_norm': 1.8795405626296997, 'learning_rate': 7.062400765115326e-06} - -[Rank 2] Trainer log: {'loss': 0.366, 'grad_norm': 1.8795405626296997, 'learning_rate': 7.062400765115326e-06}[Rank 0] Trainer log: {'loss': 0.366, 'grad_norm': 1.8795405626296997, 'learning_rate': 7.062400765115326e-06} - -{'loss': 0.366, 'grad_norm': 1.8795405626296997, 'learning_rate': 7.062400765115326e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19695255756378174, 'train/info_loss': 0.22698785364627838, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010654343059286476, 'train/video_loss': 0.22688131034374237, 'train/total_loss': 0.4238338768482208} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10710723400115968, 'train/info_loss': 0.10411542654037476, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010621786350384355, 'train/video_loss': 0.10400921106338501, 'train/total_loss': 0.21111644804477692} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.4873, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4017, 'grad_norm': 6.347061634063721, 'learning_rate': 7.052205551650796e-06}[Rank 2] Trainer log: {'loss': 0.4017, 'grad_norm': 6.347061634063721, 'learning_rate': 7.052205551650796e-06}[Rank 1] Trainer log: {'loss': 0.4017, 'grad_norm': 6.347061634063721, 'learning_rate': 7.052205551650796e-06} - - -[Rank 0] Trainer log: {'loss': 0.4017, 'grad_norm': 6.347061634063721, 'learning_rate': 7.052205551650796e-06} -{'loss': 0.4017, 'grad_norm': 6.347061634063721, 'learning_rate': 7.052205551650796e-06, 'epoch': 0.62} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025314660742878917, 'train/lm_loss': 2.7869190671481195e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.05157986655831337, 'train/uncertainty_loss': -6.967865047045053e-05, 'train/video_loss': 0.053554195910692215, 'train/total_loss': 0.05358206480741501} -tensor(0.0716, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002649300033226609, 'train/lm_loss': 3.2469973666593434e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.17051982879638672, 'train/uncertainty_loss': -7.033495930954815e-05, 'train/video_loss': 0.1725909262895584, 'train/total_loss': 0.1726233959197998} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3338, 'grad_norm': 2.1469223499298096, 'learning_rate': 7.0420136926747225e-06}[Rank 2] Trainer log: {'loss': 0.3338, 'grad_norm': 2.1469223499298096, 'learning_rate': 7.0420136926747225e-06}[Rank 3] Trainer log: {'loss': 0.3338, 'grad_norm': 2.1469223499298096, 'learning_rate': 7.0420136926747225e-06} - - -[Rank 0] Trainer log: {'loss': 0.3338, 'grad_norm': 2.1469223499298096, 'learning_rate': 7.0420136926747225e-06} -{'loss': 0.3338, 'grad_norm': 2.1469223499298096, 'learning_rate': 7.0420136926747225e-06, 'epoch': 0.62} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0381, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019890042021870614, 'train/lm_loss': 5.504327709786594e-05, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.14498263597488403, 'train/uncertainty_loss': -7.070401916280389e-05, 'train/video_loss': 0.1465313881635666, 'train/total_loss': 0.14658643305301666} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31899094581604004, 'train/info_loss': 0.18360458314418793, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011264968197792769, 'train/video_loss': 0.18349193036556244, 'train/total_loss': 0.5024828910827637} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1581, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3398, 'grad_norm': 4.42339563369751, 'learning_rate': 7.031825199785103e-06}[Rank 1] Trainer log: {'loss': 0.3398, 'grad_norm': 4.42339563369751, 'learning_rate': 7.031825199785103e-06}[Rank 2] Trainer log: {'loss': 0.3398, 'grad_norm': 4.42339563369751, 'learning_rate': 7.031825199785103e-06} - - -[Rank 0] Trainer log: {'loss': 0.3398, 'grad_norm': 4.42339563369751, 'learning_rate': 7.031825199785103e-06} -{'loss': 0.3398, 'grad_norm': 4.42339563369751, 'learning_rate': 7.031825199785103e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029584590811282396, 'train/lm_loss': 4.7821048065088695e-05, 'train/info_loss': 2.6940573661704548e-05, 'train/ref_loss': 0.10436344891786575, 'train/uncertainty_loss': -6.816043751314283e-05, 'train/video_loss': 0.10668899863958359, 'train/total_loss': 0.10673681646585464} -tensor(0.0478, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0005265767686069012, 'train/lm_loss': 3.180250932928175e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.18219846487045288, 'train/uncertainty_loss': -7.687379838898778e-05, 'train/video_loss': 0.18635554611682892, 'train/total_loss': 0.1863873451948166} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2948, 'grad_norm': 8.064760208129883, 'learning_rate': 7.021640084576078e-06} -[Rank 0] Trainer log: {'loss': 0.2948, 'grad_norm': 8.064760208129883, 'learning_rate': 7.021640084576078e-06}[Rank 1] Trainer log: {'loss': 0.2948, 'grad_norm': 8.064760208129883, 'learning_rate': 7.021640084576078e-06} -[Rank 3] Trainer log: {'loss': 0.2948, 'grad_norm': 8.064760208129883, 'learning_rate': 7.021640084576078e-06} - -{'loss': 0.2948, 'grad_norm': 8.064760208129883, 'learning_rate': 7.021640084576078e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29478206634521487, 'train/info_loss': 0.18742595613002777, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001082689967006445, 'train/video_loss': 0.18731768429279327, 'train/total_loss': 0.4820997714996338} -tensor(0.2469, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27183420658111573, 'train/info_loss': 0.2652425169944763, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013236821396276355, 'train/video_loss': 0.2651101350784302, 'train/total_loss': 0.5369443893432617} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3383, 'grad_norm': 4.6160502433776855, 'learning_rate': 7.011458358637968e-06} -[Rank 3] Trainer log: {'loss': 0.3383, 'grad_norm': 4.6160502433776855, 'learning_rate': 7.011458358637968e-06} -[Rank 0] Trainer log: {'loss': 0.3383, 'grad_norm': 4.6160502433776855, 'learning_rate': 7.011458358637968e-06}[Rank 1] Trainer log: {'loss': 0.3383, 'grad_norm': 4.6160502433776855, 'learning_rate': 7.011458358637968e-06} - -{'loss': 0.3383, 'grad_norm': 4.6160502433776855, 'learning_rate': 7.011458358637968e-06, 'epoch': 0.62} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2096, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019793708343058826, 'train/lm_loss': 4.77495399536565e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.12697729468345642, 'train/uncertainty_loss': -6.667752750217914e-05, 'train/video_loss': 0.12852194905281067, 'train/total_loss': 0.12856969237327576} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25411291122436525, 'train/info_loss': 0.4201201796531677, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013892757706344128, 'train/video_loss': 0.4199812412261963, 'train/total_loss': 0.6740942001342773} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4701, 'grad_norm': 4.917999744415283, 'learning_rate': 7.001280033557221e-06}[Rank 3] Trainer log: {'loss': 0.4701, 'grad_norm': 4.917999744415283, 'learning_rate': 7.001280033557221e-06}[Rank 0] Trainer log: {'loss': 0.4701, 'grad_norm': 4.917999744415283, 'learning_rate': 7.001280033557221e-06} - -[Rank 2] Trainer log: {'loss': 0.4701, 'grad_norm': 4.917999744415283, 'learning_rate': 7.001280033557221e-06} - -{'loss': 0.4701, 'grad_norm': 4.917999744415283, 'learning_rate': 7.001280033557221e-06, 'epoch': 0.62} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.029169750213623048, 'train/info_loss': 0.13532738387584686, 'train/ref_loss': None, 'train/uncertainty_loss': -9.930904489010573e-05, 'train/video_loss': 0.13522806763648987, 'train/total_loss': 0.16439782083034515} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0481, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4488419055938721, 'train/info_loss': 0.19105997681617737, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010680613340809942, 'train/video_loss': 0.19095316529273987, 'train/total_loss': 0.6397950649261475} -tensor(0.1127, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0102, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3453, 'grad_norm': 8.264098167419434, 'learning_rate': 6.991105120916419e-06}[Rank 1] Trainer log: {'loss': 0.3453, 'grad_norm': 8.264098167419434, 'learning_rate': 6.991105120916419e-06}[Rank 2] Trainer log: {'loss': 0.3453, 'grad_norm': 8.264098167419434, 'learning_rate': 6.991105120916419e-06} - - -[Rank 0] Trainer log: {'loss': 0.3453, 'grad_norm': 8.264098167419434, 'learning_rate': 6.991105120916419e-06} -{'loss': 0.3453, 'grad_norm': 8.264098167419434, 'learning_rate': 6.991105120916419e-06, 'epoch': 0.62} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1430, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017404812388122082, 'train/lm_loss': 3.182634827680886e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.30836406350135803, 'train/uncertainty_loss': 0.014302468299865723, 'train/video_loss': 0.32408127188682556, 'train/total_loss': 0.3241131007671356} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003621518146246672, 'train/lm_loss': 2.820293011609465e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.18012340366840363, 'train/uncertainty_loss': -7.173661142587662e-05, 'train/video_loss': 0.18297022581100464, 'train/total_loss': 0.1829984337091446} -tensor(0.1408, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3504, 'grad_norm': 2.358250617980957, 'learning_rate': 6.980933632294269e-06}[Rank 3] Trainer log: {'loss': 0.3504, 'grad_norm': 2.358250617980957, 'learning_rate': 6.980933632294269e-06}[Rank 1] Trainer log: {'loss': 0.3504, 'grad_norm': 2.358250617980957, 'learning_rate': 6.980933632294269e-06} - - -[Rank 0] Trainer log: {'loss': 0.3504, 'grad_norm': 2.358250617980957, 'learning_rate': 6.980933632294269e-06} -{'loss': 0.3504, 'grad_norm': 2.358250617980957, 'learning_rate': 6.980933632294269e-06, 'epoch': 0.62} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4284193515777588, 'train/info_loss': 0.15002264082431793, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012835678644478323, 'train/video_loss': 0.14989428222179413, 'train/total_loss': 0.5783136487007141} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3098216533660889, 'train/info_loss': 0.20968572795391083, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012355189537629485, 'train/video_loss': 0.20956218242645264, 'train/total_loss': 0.5193838477134705} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.299, 'grad_norm': 2.584981918334961, 'learning_rate': 6.9707655792655635e-06}[Rank 2] Trainer log: {'loss': 0.299, 'grad_norm': 2.584981918334961, 'learning_rate': 6.9707655792655635e-06}[Rank 0] Trainer log: {'loss': 0.299, 'grad_norm': 2.584981918334961, 'learning_rate': 6.9707655792655635e-06} - -[Rank 3] Trainer log: {'loss': 0.299, 'grad_norm': 2.584981918334961, 'learning_rate': 6.9707655792655635e-06} - -{'loss': 0.299, 'grad_norm': 2.584981918334961, 'learning_rate': 6.9707655792655635e-06, 'epoch': 0.62} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026590838097035886, 'train/lm_loss': 6.138336611911654e-05, 'train/info_loss': 2.5331331926281564e-05, 'train/ref_loss': 0.17041140794754028, 'train/uncertainty_loss': -7.295894320122898e-05, 'train/video_loss': 0.17249104380607605, 'train/total_loss': 0.1725524216890335} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0949, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022726673632860185, 'train/lm_loss': 4.7439671470783656e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.2776205241680145, 'train/uncertainty_loss': 0.009487228095531465, 'train/video_loss': 0.2889508008956909, 'train/total_loss': 0.28899824619293213} -[Rank 3] Trainer log: {'loss': 0.2996, 'grad_norm': 2.5141408443450928, 'learning_rate': 6.960600973401207e-06}[Rank 1] Trainer log: {'loss': 0.2996, 'grad_norm': 2.5141408443450928, 'learning_rate': 6.960600973401207e-06}[Rank 2] Trainer log: {'loss': 0.2996, 'grad_norm': 2.5141408443450928, 'learning_rate': 6.960600973401207e-06} - - -[Rank 0] Trainer log: {'loss': 0.2996, 'grad_norm': 2.5141408443450928, 'learning_rate': 6.960600973401207e-06} -{'loss': 0.2996, 'grad_norm': 2.5141408443450928, 'learning_rate': 6.960600973401207e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13866126537322998, 'train/info_loss': 0.15124395489692688, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010892548598349095, 'train/video_loss': 0.15113502740859985, 'train/total_loss': 0.28979629278182983} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003490289906039834, 'train/lm_loss': 4.7773375990800565e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.207356795668602, 'train/uncertainty_loss': -7.340416777879e-05, 'train/video_loss': 0.21009941399097443, 'train/total_loss': 0.2101471871137619} -tensor(0.0012, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3176, 'grad_norm': 4.495316982269287, 'learning_rate': 6.950439826268163e-06}[Rank 2] Trainer log: {'loss': 0.3176, 'grad_norm': 4.495316982269287, 'learning_rate': 6.950439826268163e-06}[Rank 1] Trainer log: {'loss': 0.3176, 'grad_norm': 4.495316982269287, 'learning_rate': 6.950439826268163e-06} - - -[Rank 0] Trainer log: {'loss': 0.3176, 'grad_norm': 4.495316982269287, 'learning_rate': 6.950439826268163e-06} -{'loss': 0.3176, 'grad_norm': 4.495316982269287, 'learning_rate': 6.950439826268163e-06, 'epoch': 0.62} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.2371, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002763666678220034, 'train/lm_loss': 3.6474716034717856e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.368747740983963, 'train/uncertainty_loss': 0.02370729893445969, 'train/video_loss': 0.3946886956691742, 'train/total_loss': 0.39472517371177673} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0077, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018445461755618454, 'train/lm_loss': 3.68322798749432e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.18269000947475433, 'train/uncertainty_loss': -6.906326161697507e-05, 'train/video_loss': 0.18411999940872192, 'train/total_loss': 0.18415683507919312} -[Rank 2] Trainer log: {'loss': 0.4037, 'grad_norm': 4.127529621124268, 'learning_rate': 6.940282149429472e-06}[Rank 1] Trainer log: {'loss': 0.4037, 'grad_norm': 4.127529621124268, 'learning_rate': 6.940282149429472e-06} - -[Rank 3] Trainer log: {'loss': 0.4037, 'grad_norm': 4.127529621124268, 'learning_rate': 6.940282149429472e-06}[Rank 0] Trainer log: {'loss': 0.4037, 'grad_norm': 4.127529621124268, 'learning_rate': 6.940282149429472e-06} - -{'loss': 0.4037, 'grad_norm': 4.127529621124268, 'learning_rate': 6.940282149429472e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11360511779785157, 'train/info_loss': 0.16941320896148682, 'train/ref_loss': None, 'train/uncertainty_loss': -9.124868083745242e-05, 'train/video_loss': 0.1693219542503357, 'train/total_loss': 0.2829270660877228} -tensor(0.2433, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1733, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4864, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0106, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2501, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) {'train/tv_loss': 0.00020638941787183286, 'train/lm_loss': 5.428054137155414e-05, 'train/info_loss': 2.8251803087187e-05, 'train/ref_loss': 0.36885836720466614, 'train/uncertainty_loss': 0.02501468658447266, 'train/video_loss': 0.3955524265766144, 'train/total_loss': 0.3956066966056824} -tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3642, 'grad_norm': 18.902246475219727, 'learning_rate': 6.93012795444421e-06}[Rank 1] Trainer log: {'loss': 0.3642, 'grad_norm': 18.902246475219727, 'learning_rate': 6.93012795444421e-06} - -[Rank 3] Trainer log: {'loss': 0.3642, 'grad_norm': 18.902246475219727, 'learning_rate': 6.93012795444421e-06} -[Rank 0] Trainer log: {'loss': 0.3642, 'grad_norm': 18.902246475219727, 'learning_rate': 6.93012795444421e-06} -{'loss': 0.3642, 'grad_norm': 18.902246475219727, 'learning_rate': 6.93012795444421e-06, 'epoch': 0.62} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.105245840549469, 'train/info_loss': 0.3097156286239624, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001056134351529181, 'train/video_loss': 0.3096100091934204, 'train/total_loss': 0.41485583782196045} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08594682812690735, 'train/info_loss': 0.17224568128585815, 'train/ref_loss': None, 'train/uncertainty_loss': -9.640723583288492e-05, 'train/video_loss': 0.17214927077293396, 'train/total_loss': 0.2580960988998413} -tensor(0.0048, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3754, 'grad_norm': 2.587892770767212, 'learning_rate': 6.919977252867513e-06}[Rank 2] Trainer log: {'loss': 0.3754, 'grad_norm': 2.587892770767212, 'learning_rate': 6.919977252867513e-06}[Rank 1] Trainer log: {'loss': 0.3754, 'grad_norm': 2.587892770767212, 'learning_rate': 6.919977252867513e-06} - - -[Rank 0] Trainer log: {'loss': 0.3754, 'grad_norm': 2.587892770767212, 'learning_rate': 6.919977252867513e-06} -{'loss': 0.3754, 'grad_norm': 2.587892770767212, 'learning_rate': 6.919977252867513e-06, 'epoch': 0.62} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27659876346588136, 'train/info_loss': 0.15869122743606567, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012577928137034178, 'train/video_loss': 0.1585654467344284, 'train/total_loss': 0.435164213180542} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024316087365150452, 'train/lm_loss': 0.00010123074753209949, 'train/info_loss': 2.962263170047663e-05, 'train/ref_loss': 0.20236599445343018, 'train/uncertainty_loss': -7.090355502441526e-05, 'train/video_loss': 0.20427000522613525, 'train/total_loss': 0.20437122881412506} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3272, 'grad_norm': 6.539691925048828, 'learning_rate': 6.909830056250527e-06}[Rank 3] Trainer log: {'loss': 0.3272, 'grad_norm': 6.539691925048828, 'learning_rate': 6.909830056250527e-06} -[Rank 1] Trainer log: {'loss': 0.3272, 'grad_norm': 6.539691925048828, 'learning_rate': 6.909830056250527e-06} - -[Rank 2] Trainer log: {'loss': 0.3272, 'grad_norm': 6.539691925048828, 'learning_rate': 6.909830056250527e-06} -{'loss': 0.3272, 'grad_norm': 6.539691925048828, 'learning_rate': 6.909830056250527e-06, 'epoch': 0.62} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15629886388778688, 'train/info_loss': 0.18036261200904846, 'train/ref_loss': None, 'train/uncertainty_loss': -8.788546547293663e-05, 'train/video_loss': 0.18027472496032715, 'train/total_loss': 0.33657360076904297} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12144585847854615, 'train/info_loss': 0.24021096527576447, 'train/ref_loss': None, 'train/uncertainty_loss': -9.601531201042235e-05, 'train/video_loss': 0.2401149570941925, 'train/total_loss': 0.3615608215332031} -tensor(0.0453, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3549, 'grad_norm': 3.2858850955963135, 'learning_rate': 6.899686376140407e-06} -[Rank 0] Trainer log: {'loss': 0.3549, 'grad_norm': 3.2858850955963135, 'learning_rate': 6.899686376140407e-06}[Rank 2] Trainer log: {'loss': 0.3549, 'grad_norm': 3.2858850955963135, 'learning_rate': 6.899686376140407e-06} -[Rank 1] Trainer log: {'loss': 0.3549, 'grad_norm': 3.2858850955963135, 'learning_rate': 6.899686376140407e-06} - -{'loss': 0.3549, 'grad_norm': 3.2858850955963135, 'learning_rate': 6.899686376140407e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4976, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00013512736186385154, 'train/lm_loss': 3.2827543327584864e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.5332517623901367, 'train/uncertainty_loss': 0.049756681919097906, 'train/video_loss': 0.5841125249862671, 'train/total_loss': 0.5841453671455383} -tensor(0.1938, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020235597621649505, 'train/lm_loss': 3.645087999757379e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.039368461817502975, 'train/uncertainty_loss': -7.008882821537555e-05, 'train/video_loss': 0.0409395731985569, 'train/total_loss': 0.04097602516412735} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(0.0434, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3358, 'grad_norm': 2.009228229522705, 'learning_rate': 6.889546224080317e-06}[Rank 2] Trainer log: {'loss': 0.3358, 'grad_norm': 2.009228229522705, 'learning_rate': 6.889546224080317e-06}[Rank 0] Trainer log: {'loss': 0.3358, 'grad_norm': 2.009228229522705, 'learning_rate': 6.889546224080317e-06} - - -[Rank 3] Trainer log: {'loss': 0.3358, 'grad_norm': 2.009228229522705, 'learning_rate': 6.889546224080317e-06} -{'loss': 0.3358, 'grad_norm': 2.009228229522705, 'learning_rate': 6.889546224080317e-06, 'epoch': 0.62} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36879982948303225, 'train/info_loss': 0.2138832062482834, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014512798516079785, 'train/video_loss': 0.2137380838394165, 'train/total_loss': 0.5825378894805908} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07442213892936707, 'train/info_loss': 0.18131089210510254, 'train/ref_loss': None, 'train/uncertainty_loss': -9.879503631964326e-05, 'train/video_loss': 0.18121209740638733, 'train/total_loss': 0.25563424825668335} -tensor(0.0795, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3347, 'grad_norm': 5.800853252410889, 'learning_rate': 6.879409611609394e-06}[Rank 2] Trainer log: {'loss': 0.3347, 'grad_norm': 5.800853252410889, 'learning_rate': 6.879409611609394e-06}[Rank 3] Trainer log: {'loss': 0.3347, 'grad_norm': 5.800853252410889, 'learning_rate': 6.879409611609394e-06} - - -[Rank 0] Trainer log: {'loss': 0.3347, 'grad_norm': 5.800853252410889, 'learning_rate': 6.879409611609394e-06} -{'loss': 0.3347, 'grad_norm': 5.800853252410889, 'learning_rate': 6.879409611609394e-06, 'epoch': 0.62} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1428, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0797, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33799951076507573, 'train/info_loss': 0.19124862551689148, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013314494863152506, 'train/video_loss': 0.19111548364162445, 'train/total_loss': 0.5291150212287903} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28065168857574463, 'train/info_loss': 0.23576030135154724, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010419415775686502, 'train/video_loss': 0.23565611243247986, 'train/total_loss': 0.5163078308105469} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3322, 'grad_norm': 5.654279708862305, 'learning_rate': 6.869276550262759e-06}[Rank 3] Trainer log: {'loss': 0.3322, 'grad_norm': 5.654279708862305, 'learning_rate': 6.869276550262759e-06}[Rank 2] Trainer log: {'loss': 0.3322, 'grad_norm': 5.654279708862305, 'learning_rate': 6.869276550262759e-06} - - -[Rank 0] Trainer log: {'loss': 0.3322, 'grad_norm': 5.654279708862305, 'learning_rate': 6.869276550262759e-06} -{'loss': 0.3322, 'grad_norm': 5.654279708862305, 'learning_rate': 6.869276550262759e-06, 'epoch': 0.62} -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4676541328430176, 'train/info_loss': 0.16758723556995392, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00016314452514052392, 'train/video_loss': 0.16742409765720367, 'train/total_loss': 0.6350782513618469} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22262122631073, 'train/info_loss': 0.1668197363615036, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012730587041005493, 'train/video_loss': 0.16669243574142456, 'train/total_loss': 0.389313668012619} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2260, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3792, 'grad_norm': 3.2106826305389404, 'learning_rate': 6.859147051571483e-06}[Rank 3] Trainer log: {'loss': 0.3792, 'grad_norm': 3.2106826305389404, 'learning_rate': 6.859147051571483e-06} - -[Rank 0] Trainer log: {'loss': 0.3792, 'grad_norm': 3.2106826305389404, 'learning_rate': 6.859147051571483e-06}[Rank 2] Trainer log: {'loss': 0.3792, 'grad_norm': 3.2106826305389404, 'learning_rate': 6.859147051571483e-06} - -{'loss': 0.3792, 'grad_norm': 3.2106826305389404, 'learning_rate': 6.859147051571483e-06, 'epoch': 0.62} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003615721827372909, 'train/lm_loss': 5.4447393631562596e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.16375698149204254, 'train/uncertainty_loss': -7.258085533976555e-05, 'train/video_loss': 0.16660308837890625, 'train/total_loss': 0.16665753722190857} -tensor(0.1190, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32675549983978275, 'train/info_loss': 0.17301881313323975, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011550711933523418, 'train/video_loss': 0.17290329933166504, 'train/total_loss': 0.4996587932109833} -tensor(0.4684, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2099, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3533, 'grad_norm': 7.92647647857666, 'learning_rate': 6.849021127062585e-06} -[Rank 2] Trainer log: {'loss': 0.3533, 'grad_norm': 7.92647647857666, 'learning_rate': 6.849021127062585e-06} -[Rank 0] Trainer log: {'loss': 0.3533, 'grad_norm': 7.92647647857666, 'learning_rate': 6.849021127062585e-06} -[Rank 3] Trainer log: {'loss': 0.3533, 'grad_norm': 7.92647647857666, 'learning_rate': 6.849021127062585e-06} -{'loss': 0.3533, 'grad_norm': 7.92647647857666, 'learning_rate': 6.849021127062585e-06, 'epoch': 0.62} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022650642786175014, 'train/lm_loss': 2.4793995544314386e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.13611871004104614, 'train/uncertainty_loss': -7.240913109853865e-05, 'train/video_loss': 0.13787749409675598, 'train/total_loss': 0.13790228962898254}tensor(0.3715, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.7555, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023752686101943257, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.19341911375522614, 'train/uncertainty_loss': -7.224446162581445e-05, 'train/video_loss': 0.19526681303977966, 'train/total_loss': 0.19529470801353455} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4049, 'grad_norm': 15.267955780029297, 'learning_rate': 6.838898788259029e-06}[Rank 3] Trainer log: {'loss': 0.4049, 'grad_norm': 15.267955780029297, 'learning_rate': 6.838898788259029e-06} -[Rank 0] Trainer log: {'loss': 0.4049, 'grad_norm': 15.267955780029297, 'learning_rate': 6.838898788259029e-06} -[Rank 2] Trainer log: {'loss': 0.4049, 'grad_norm': 15.267955780029297, 'learning_rate': 6.838898788259029e-06} - -{'loss': 0.4049, 'grad_norm': 15.267955780029297, 'learning_rate': 6.838898788259029e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5120602607727051, 'train/info_loss': 0.1643429845571518, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013286445755511522, 'train/video_loss': 0.16421012580394745, 'train/total_loss': 0.6762704253196716} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0163, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3184762239456177, 'train/info_loss': 0.20519529283046722, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012439993442967535, 'train/video_loss': 0.20507089793682098, 'train/total_loss': 0.5235471129417419} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3206, 'grad_norm': 3.0136170387268066, 'learning_rate': 6.8287800466796715e-06}[Rank 1] Trainer log: {'loss': 0.3206, 'grad_norm': 3.0136170387268066, 'learning_rate': 6.8287800466796715e-06} -[Rank 2] Trainer log: {'loss': 0.3206, 'grad_norm': 3.0136170387268066, 'learning_rate': 6.8287800466796715e-06} -[Rank 3] Trainer log: {'loss': 0.3206, 'grad_norm': 3.0136170387268066, 'learning_rate': 6.8287800466796715e-06} - -{'loss': 0.3206, 'grad_norm': 3.0136170387268066, 'learning_rate': 6.8287800466796715e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001552836853079498, 'train/lm_loss': 4.126599815208465e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.18847525119781494, 'train/uncertainty_loss': -6.913632969371975e-05, 'train/video_loss': 0.18967144191265106, 'train/total_loss': 0.18971270322799683} -tensor(1.2189, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(1.2653, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018894661916419865, 'train/lm_loss': 3.714216582011432e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 1.092767596244812, 'train/uncertainty_loss': 0.12653160095214844, 'train/video_loss': 1.2208327054977417, 'train/total_loss': 1.2208698987960815} -[Rank 3] Trainer log: {'loss': 0.5569, 'grad_norm': 22.0894832611084, 'learning_rate': 6.818664913839306e-06} -[Rank 1] Trainer log: {'loss': 0.5569, 'grad_norm': 22.0894832611084, 'learning_rate': 6.818664913839306e-06} -[Rank 0] Trainer log: {'loss': 0.5569, 'grad_norm': 22.0894832611084, 'learning_rate': 6.818664913839306e-06} -[Rank 2] Trainer log: {'loss': 0.5569, 'grad_norm': 22.0894832611084, 'learning_rate': 6.818664913839306e-06} -{'loss': 0.5569, 'grad_norm': 22.0894832611084, 'learning_rate': 6.818664913839306e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21772496700286867, 'train/info_loss': 0.17574043571949005, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011186550837010145, 'train/video_loss': 0.17562857270240784, 'train/total_loss': 0.39335352182388306} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.9181, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.2715, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019865450449287894, 'train/lm_loss': 0.00010680684354156256, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.19769251346588135, 'train/uncertainty_loss': -6.702697719447314e-05, 'train/video_loss': 0.19924083352088928, 'train/total_loss': 0.19934764504432678} -[Rank 1] Trainer log: {'loss': 0.4397, 'grad_norm': 12.014140129089355, 'learning_rate': 6.8085534012485925e-06} -[Rank 0] Trainer log: {'loss': 0.4397, 'grad_norm': 12.014140129089355, 'learning_rate': 6.8085534012485925e-06}[Rank 2] Trainer log: {'loss': 0.4397, 'grad_norm': 12.014140129089355, 'learning_rate': 6.8085534012485925e-06} -[Rank 3] Trainer log: {'loss': 0.4397, 'grad_norm': 12.014140129089355, 'learning_rate': 6.8085534012485925e-06} - -{'loss': 0.4397, 'grad_norm': 12.014140129089355, 'learning_rate': 6.8085534012485925e-06, 'epoch': 0.62} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.00016442609485238792, 'train/lm_loss': 0.00020341062918305397, 'train/info_loss': 4.046991671202704e-05, 'train/ref_loss': 0.14818492531776428, 'train/uncertainty_loss': -7.111834711395205e-05, 'train/video_loss': 0.14946968853473663, 'train/total_loss': 0.14967310428619385} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13317297697067262, 'train/info_loss': 0.13992276787757874, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010056019527837634, 'train/video_loss': 0.13982221484184265, 'train/total_loss': 0.2729951739311218} -tensor(0.3346, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2263, 'grad_norm': 2.0118014812469482, 'learning_rate': 6.798445520414094e-06}[Rank 0] Trainer log: {'loss': 0.2263, 'grad_norm': 2.0118014812469482, 'learning_rate': 6.798445520414094e-06}[Rank 3] Trainer log: {'loss': 0.2263, 'grad_norm': 2.0118014812469482, 'learning_rate': 6.798445520414094e-06} - - -[Rank 2] Trainer log: {'loss': 0.2263, 'grad_norm': 2.0118014812469482, 'learning_rate': 6.798445520414094e-06} -{'loss': 0.2263, 'grad_norm': 2.0118014812469482, 'learning_rate': 6.798445520414094e-06, 'epoch': 0.62} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4639260292053223, 'train/info_loss': 0.13388203084468842, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012205661041662098, 'train/video_loss': 0.1337599754333496, 'train/total_loss': 0.5976860523223877} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09502643942832947, 'train/info_loss': 0.19427229464054108, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010091951116919518, 'train/video_loss': 0.19417136907577515, 'train/total_loss': 0.28919780254364014} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3989, 'grad_norm': 2.229557514190674, 'learning_rate': 6.788341282838224e-06} -[Rank 3] Trainer log: {'loss': 0.3989, 'grad_norm': 2.229557514190674, 'learning_rate': 6.788341282838224e-06} -[Rank 2] Trainer log: {'loss': 0.3989, 'grad_norm': 2.229557514190674, 'learning_rate': 6.788341282838224e-06} -[Rank 0] Trainer log: {'loss': 0.3989, 'grad_norm': 2.229557514190674, 'learning_rate': 6.788341282838224e-06} -{'loss': 0.3989, 'grad_norm': 2.229557514190674, 'learning_rate': 6.788341282838224e-06, 'epoch': 0.62} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0889325499534607, 'train/info_loss': 0.23334553837776184, 'train/ref_loss': None, 'train/uncertainty_loss': -9.715382475405931e-05, 'train/video_loss': 0.23324838280677795, 'train/total_loss': 0.32218092679977417} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16637678146362306, 'train/info_loss': 0.13618429005146027, 'train/ref_loss': None, 'train/uncertainty_loss': -8.875904022715987e-05, 'train/video_loss': 0.13609552383422852, 'train/total_loss': 0.3024722933769226} -tensor(0.1459, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3523, 'grad_norm': 4.403529644012451, 'learning_rate': 6.7782407000192586e-06}[Rank 1] Trainer log: {'loss': 0.3523, 'grad_norm': 4.403529644012451, 'learning_rate': 6.7782407000192586e-06}[Rank 2] Trainer log: {'loss': 0.3523, 'grad_norm': 4.403529644012451, 'learning_rate': 6.7782407000192586e-06} - -[Rank 3] Trainer log: {'loss': 0.3523, 'grad_norm': 4.403529644012451, 'learning_rate': 6.7782407000192586e-06} - -{'loss': 0.3523, 'grad_norm': 4.403529644012451, 'learning_rate': 6.7782407000192586e-06, 'epoch': 0.62} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1073, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020154376979917288, 'train/lm_loss': 4.705829196609557e-05, 'train/info_loss': 2.4914121240726672e-05, 'train/ref_loss': 0.26406562328338623, 'train/uncertainty_loss': 0.010733038932085038, 'train/video_loss': 0.276435911655426, 'train/total_loss': 0.2764829695224762} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3394638061523438, 'train/info_loss': 0.20982135832309723, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012499646982178092, 'train/video_loss': 0.20969636738300323, 'train/total_loss': 0.5491601824760437} -tensor(0.2430, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2985, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3677, 'grad_norm': 6.1958909034729, 'learning_rate': 6.768143783451315e-06} -[Rank 0] Trainer log: {'loss': 0.3677, 'grad_norm': 6.1958909034729, 'learning_rate': 6.768143783451315e-06} -[Rank 3] Trainer log: {'loss': 0.3677, 'grad_norm': 6.1958909034729, 'learning_rate': 6.768143783451315e-06}[Rank 2] Trainer log: {'loss': 0.3677, 'grad_norm': 6.1958909034729, 'learning_rate': 6.768143783451315e-06} - -{'loss': 0.3677, 'grad_norm': 6.1958909034729, 'learning_rate': 6.768143783451315e-06, 'epoch': 0.62} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1440, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018555724527686835, 'train/lm_loss': 3.723751578945667e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.30433428287506104, 'train/uncertainty_loss': 0.014402572810649873, 'train/video_loss': 0.32024508714675903, 'train/total_loss': 0.32028231024742126} -tensor(0.2398, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2482764005661011, 'train/info_loss': 0.1998979151248932, 'train/ref_loss': None, 'train/uncertainty_loss': -8.698421297594906e-05, 'train/video_loss': 0.1998109370470047, 'train/total_loss': 0.44808733463287354} -tensor(0.1754, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.384, 'grad_norm': 5.503631114959717, 'learning_rate': 6.758050544624333e-06}[Rank 2] Trainer log: {'loss': 0.384, 'grad_norm': 5.503631114959717, 'learning_rate': 6.758050544624333e-06} -[Rank 3] Trainer log: {'loss': 0.384, 'grad_norm': 5.503631114959717, 'learning_rate': 6.758050544624333e-06} -[Rank 1] Trainer log: {'loss': 0.384, 'grad_norm': 5.503631114959717, 'learning_rate': 6.758050544624333e-06} - -{'loss': 0.384, 'grad_norm': 5.503631114959717, 'learning_rate': 6.758050544624333e-06, 'epoch': 0.62} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2752016305923462, 'train/info_loss': 0.22438140213489532, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013848821399733424, 'train/video_loss': 0.2242429107427597, 'train/total_loss': 0.49944454431533813} -tensor(0.0481, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2337, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.4596, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1148, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00044927359558641914, 'train/lm_loss': 6.898643914610147e-05, 'train/info_loss': 2.6523362976149656e-05, 'train/ref_loss': 0.27680492401123047, 'train/uncertainty_loss': 0.011483031511306764, 'train/video_loss': 0.2919086813926697, 'train/total_loss': 0.2919776737689972} -[Rank 3] Trainer log: {'loss': 0.4935, 'grad_norm': 8.186606407165527, 'learning_rate': 6.747960995024073e-06}[Rank 1] Trainer log: {'loss': 0.4935, 'grad_norm': 8.186606407165527, 'learning_rate': 6.747960995024073e-06}[Rank 0] Trainer log: {'loss': 0.4935, 'grad_norm': 8.186606407165527, 'learning_rate': 6.747960995024073e-06} - - -[Rank 2] Trainer log: {'loss': 0.4935, 'grad_norm': 8.186606407165527, 'learning_rate': 6.747960995024073e-06} -{'loss': 0.4935, 'grad_norm': 8.186606407165527, 'learning_rate': 6.747960995024073e-06, 'epoch': 0.63} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14675476551055908, 'train/info_loss': 0.28074970841407776, 'train/ref_loss': None, 'train/uncertainty_loss': -9.323382982984185e-05, 'train/video_loss': 0.28065648674964905, 'train/total_loss': 0.4274112582206726} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2811506748199463, 'train/info_loss': 0.10740917176008224, 'train/ref_loss': None, 'train/uncertainty_loss': -9.747559088282288e-05, 'train/video_loss': 0.10731169581413269, 'train/total_loss': 0.3884623646736145} -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(0.1281, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2252, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4286, 'grad_norm': 10.754124641418457, 'learning_rate': 6.737875146132091e-06} -[Rank 1] Trainer log: {'loss': 0.4286, 'grad_norm': 10.754124641418457, 'learning_rate': 6.737875146132091e-06}[Rank 0] Trainer log: {'loss': 0.4286, 'grad_norm': 10.754124641418457, 'learning_rate': 6.737875146132091e-06}[Rank 3] Trainer log: {'loss': 0.4286, 'grad_norm': 10.754124641418457, 'learning_rate': 6.737875146132091e-06} - - -{'loss': 0.4286, 'grad_norm': 10.754124641418457, 'learning_rate': 6.737875146132091e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.1303, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003517218865454197, 'train/lm_loss': 2.5127740809693935e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.2857975959777832, 'train/uncertainty_loss': 0.013029138743877412, 'train/video_loss': 0.30166053771972656, 'train/total_loss': 0.3016856610774994} -tensor(0.1005, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07758356332778932, 'train/info_loss': 0.2312011420726776, 'train/ref_loss': None, 'train/uncertainty_loss': -8.804600220173598e-05, 'train/video_loss': 0.23111309111118317, 'train/total_loss': 0.3086966574192047} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4027, 'grad_norm': 3.0098726749420166, 'learning_rate': 6.7277930094257405e-06}[Rank 1] Trainer log: {'loss': 0.4027, 'grad_norm': 3.0098726749420166, 'learning_rate': 6.7277930094257405e-06} -[Rank 0] Trainer log: {'loss': 0.4027, 'grad_norm': 3.0098726749420166, 'learning_rate': 6.7277930094257405e-06} -[Rank 3] Trainer log: {'loss': 0.4027, 'grad_norm': 3.0098726749420166, 'learning_rate': 6.7277930094257405e-06} - -{'loss': 0.4027, 'grad_norm': 3.0098726749420166, 'learning_rate': 6.7277930094257405e-06, 'epoch': 0.63} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-4.6598e-05, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1903, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(0.0350, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000307237496599555, 'train/lm_loss': 3.225543186999857e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.20971694588661194, 'train/uncertainty_loss': 0.0034957528114318848, 'train/video_loss': 0.21569295227527618, 'train/total_loss': 0.21572521328926086} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.42418665885925294, 'train/info_loss': 0.1983293890953064, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012977528385818004, 'train/video_loss': 0.19819961488246918, 'train/total_loss': 0.6223862767219543} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0082, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3442, 'grad_norm': 3.3045501708984375, 'learning_rate': 6.717714596378138e-06}[Rank 3] Trainer log: {'loss': 0.3442, 'grad_norm': 3.3045501708984375, 'learning_rate': 6.717714596378138e-06}[Rank 2] Trainer log: {'loss': 0.3442, 'grad_norm': 3.3045501708984375, 'learning_rate': 6.717714596378138e-06} - - -[Rank 1] Trainer log: {'loss': 0.3442, 'grad_norm': 3.3045501708984375, 'learning_rate': 6.717714596378138e-06} -{'loss': 0.3442, 'grad_norm': 3.3045501708984375, 'learning_rate': 6.717714596378138e-06, 'epoch': 0.63} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23708295822143555, 'train/info_loss': 0.24117344617843628, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010266288882121444, 'train/video_loss': 0.24107077717781067, 'train/total_loss': 0.4781537353992462} -tensor(0.1442, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1767, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1996, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07077595591545105, 'train/info_loss': 0.16366487741470337, 'train/ref_loss': None, 'train/uncertainty_loss': -7.781412568874657e-05, 'train/video_loss': 0.1635870635509491, 'train/total_loss': 0.23436301946640015} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0872, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2804, 'grad_norm': 8.50947093963623, 'learning_rate': 6.707639918458176e-06}[Rank 2] Trainer log: {'loss': 0.2804, 'grad_norm': 8.50947093963623, 'learning_rate': 6.707639918458176e-06} -[Rank 1] Trainer log: {'loss': 0.2804, 'grad_norm': 8.50947093963623, 'learning_rate': 6.707639918458176e-06} - -[Rank 0] Trainer log: {'loss': 0.2804, 'grad_norm': 8.50947093963623, 'learning_rate': 6.707639918458176e-06} -{'loss': 0.2804, 'grad_norm': 8.50947093963623, 'learning_rate': 6.707639918458176e-06, 'epoch': 0.63} -tensor(0.2576, device='cuda:1', grad_fn=) tensor(-0.0006, device='cuda:1', grad_fn=) -tensor(0.1675, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3952, device='cuda:0', grad_fn=) tensor(-0.0006, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020331130363047123, 'train/lm_loss': 3.640320501290262e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.5076949596405029, 'train/uncertainty_loss': 0.03952285051345825, 'train/video_loss': 0.5488659143447876, 'train/total_loss': 0.5489023327827454} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4900837421417237, 'train/info_loss': 0.14553533494472504, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011561989085748792, 'train/video_loss': 0.14541971683502197, 'train/total_loss': 0.6355034708976746} -tensor(0.1011, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3414, 'grad_norm': 3.6949758529663086, 'learning_rate': 6.6975689871304925e-06}[Rank 2] Trainer log: {'loss': 0.3414, 'grad_norm': 3.6949758529663086, 'learning_rate': 6.6975689871304925e-06}[Rank 3] Trainer log: {'loss': 0.3414, 'grad_norm': 3.6949758529663086, 'learning_rate': 6.6975689871304925e-06} - - -[Rank 0] Trainer log: {'loss': 0.3414, 'grad_norm': 3.6949758529663086, 'learning_rate': 6.6975689871304925e-06} -{'loss': 0.3414, 'grad_norm': 3.6949758529663086, 'learning_rate': 6.6975689871304925e-06, 'epoch': 0.63} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.308084511756897, 'train/info_loss': 0.15713857114315033, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011176167754456401, 'train/video_loss': 0.15702681243419647, 'train/total_loss': 0.46511131525039673} -tensor(0.0516, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0742, device='cuda:1', grad_fn=) tensor(-0.0006, device='cuda:1', grad_fn=) -tensor(-0.0018, device='cuda:0', grad_fn=) tensor(-0.0018, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2358006238937378, 'train/info_loss': 0.37558963894844055, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00017538393149152398, 'train/video_loss': 0.37541425228118896, 'train/total_loss': 0.6112148761749268} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1319, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2028, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3608, 'grad_norm': 5.606040000915527, 'learning_rate': 6.687501813855455e-06}[Rank 3] Trainer log: {'loss': 0.3608, 'grad_norm': 5.606040000915527, 'learning_rate': 6.687501813855455e-06} -[Rank 1] Trainer log: {'loss': 0.3608, 'grad_norm': 5.606040000915527, 'learning_rate': 6.687501813855455e-06} -[Rank 2] Trainer log: {'loss': 0.3608, 'grad_norm': 5.606040000915527, 'learning_rate': 6.687501813855455e-06} - -{'loss': 0.3608, 'grad_norm': 5.606040000915527, 'learning_rate': 6.687501813855455e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.43306818008422854, 'train/info_loss': 0.21520879864692688, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011452332837507129, 'train/video_loss': 0.21509426832199097, 'train/total_loss': 0.6481624841690063} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0311, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002403115388005972, 'train/lm_loss': 4.112297610845417e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.15287740528583527, 'train/uncertainty_loss': -6.652298616245389e-05, 'train/video_loss': 0.1547567993402481, 'train/total_loss': 0.15479792654514313} -tensor(0.2608, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0597, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.366, 'grad_norm': 2.984349489212036, 'learning_rate': 6.677438410089163e-06}[Rank 0] Trainer log: {'loss': 0.366, 'grad_norm': 2.984349489212036, 'learning_rate': 6.677438410089163e-06}[Rank 3] Trainer log: {'loss': 0.366, 'grad_norm': 2.984349489212036, 'learning_rate': 6.677438410089163e-06} - - -[Rank 1] Trainer log: {'loss': 0.366, 'grad_norm': 2.984349489212036, 'learning_rate': 6.677438410089163e-06} -{'loss': 0.366, 'grad_norm': 2.984349489212036, 'learning_rate': 6.677438410089163e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12351206541061402, 'train/info_loss': 0.22129005193710327, 'train/ref_loss': None, 'train/uncertainty_loss': -9.475399856455624e-05, 'train/video_loss': 0.2211952954530716, 'train/total_loss': 0.3447073698043823} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07152055501937866, 'train/info_loss': 0.1371832937002182, 'train/ref_loss': None, 'train/uncertainty_loss': -8.928876486606896e-05, 'train/video_loss': 0.13709400594234467, 'train/total_loss': 0.2086145579814911} -tensor(0.0386, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2731, 'grad_norm': 2.959535837173462, 'learning_rate': 6.667378787283421e-06} -[Rank 3] Trainer log: {'loss': 0.2731, 'grad_norm': 2.959535837173462, 'learning_rate': 6.667378787283421e-06} -[Rank 0] Trainer log: {'loss': 0.2731, 'grad_norm': 2.959535837173462, 'learning_rate': 6.667378787283421e-06}[Rank 2] Trainer log: {'loss': 0.2731, 'grad_norm': 2.959535837173462, 'learning_rate': 6.667378787283421e-06} - -{'loss': 0.2731, 'grad_norm': 2.959535837173462, 'learning_rate': 6.667378787283421e-06, 'epoch': 0.63} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15674498081207278, 'train/info_loss': 0.2293657660484314, 'train/ref_loss': None, 'train/uncertainty_loss': -8.666801149956883e-05, 'train/video_loss': 0.22927910089492798, 'train/total_loss': 0.3860240876674652} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(0.1008, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000281658279709518, 'train/lm_loss': 2.87273753201589e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.274599552154541, 'train/uncertainty_loss': 0.010076674818992616, 'train/video_loss': 0.28695112466812134, 'train/total_loss': 0.2869798541069031} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0492, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3006, 'grad_norm': 5.138636112213135, 'learning_rate': 6.657322956885734e-06}[Rank 1] Trainer log: {'loss': 0.3006, 'grad_norm': 5.138636112213135, 'learning_rate': 6.657322956885734e-06} -[Rank 3] Trainer log: {'loss': 0.3006, 'grad_norm': 5.138636112213135, 'learning_rate': 6.657322956885734e-06} - -[Rank 2] Trainer log: {'loss': 0.3006, 'grad_norm': 5.138636112213135, 'learning_rate': 6.657322956885734e-06} -{'loss': 0.3006, 'grad_norm': 5.138636112213135, 'learning_rate': 6.657322956885734e-06, 'epoch': 0.63} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.333745288848877, 'train/info_loss': 0.20276683568954468, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011839151848107577, 'train/video_loss': 0.20264844596385956, 'train/total_loss': 0.5363937616348267} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.0881, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0591, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0579, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021142440382391215, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.18212451040744781, 'train/uncertainty_loss': -6.894645048305392e-05, 'train/video_loss': 0.18376699090003967, 'train/total_loss': 0.18379905819892883} -tensor(0.0117, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3337, 'grad_norm': 16.374658584594727, 'learning_rate': 6.647270930339289e-06} -[Rank 2] Trainer log: {'loss': 0.3337, 'grad_norm': 16.374658584594727, 'learning_rate': 6.647270930339289e-06}[Rank 0] Trainer log: {'loss': 0.3337, 'grad_norm': 16.374658584594727, 'learning_rate': 6.647270930339289e-06} -[Rank 3] Trainer log: {'loss': 0.3337, 'grad_norm': 16.374658584594727, 'learning_rate': 6.647270930339289e-06} - -{'loss': 0.3337, 'grad_norm': 16.374658584594727, 'learning_rate': 6.647270930339289e-06, 'epoch': 0.63} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.2259, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0738, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.3244, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003074802225455642, 'train/lm_loss': 5.408985889516771e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.43524056673049927, 'train/uncertainty_loss': 0.03244365155696869, 'train/video_loss': 0.47017017006874084, 'train/total_loss': 0.4702242612838745} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11624451875686646, 'train/info_loss': 0.20895139873027802, 'train/ref_loss': None, 'train/uncertainty_loss': -9.229739080183209e-05, 'train/video_loss': 0.20885910093784332, 'train/total_loss': 0.32510361075401306} -tensor(0.1481, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4724, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3817, 'grad_norm': 4.571194171905518, 'learning_rate': 6.637222719082945e-06}[Rank 2] Trainer log: {'loss': 0.3817, 'grad_norm': 4.571194171905518, 'learning_rate': 6.637222719082945e-06}[Rank 3] Trainer log: {'loss': 0.3817, 'grad_norm': 4.571194171905518, 'learning_rate': 6.637222719082945e-06} - - -[Rank 0] Trainer log: {'loss': 0.3817, 'grad_norm': 4.571194171905518, 'learning_rate': 6.637222719082945e-06} -{'loss': 0.3817, 'grad_norm': 4.571194171905518, 'learning_rate': 6.637222719082945e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.424381685256958, 'train/info_loss': 0.22380010783672333, 'train/ref_loss': None, 'train/uncertainty_loss': -9.66091058216989e-05, 'train/video_loss': 0.2237035036087036, 'train/total_loss': 0.6480852365493774} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0162, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.0482, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020360816270112992, 'train/lm_loss': 2.794070460367948e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.2513878345489502, 'train/uncertainty_loss': 0.0048196036368608475, 'train/video_loss': 0.25785574316978455, 'train/total_loss': 0.2578836977481842} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2001, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3486, 'grad_norm': 9.520792961120605, 'learning_rate': 6.627178334551227e-06}[Rank 0] Trainer log: {'loss': 0.3486, 'grad_norm': 9.520792961120605, 'learning_rate': 6.627178334551227e-06} -[Rank 2] Trainer log: {'loss': 0.3486, 'grad_norm': 9.520792961120605, 'learning_rate': 6.627178334551227e-06} - -[Rank 3] Trainer log: {'loss': 0.3486, 'grad_norm': 9.520792961120605, 'learning_rate': 6.627178334551227e-06} -{'loss': 0.3486, 'grad_norm': 9.520792961120605, 'learning_rate': 6.627178334551227e-06, 'epoch': 0.63} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0337, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2806731939315796, 'train/info_loss': 0.23063869774341583, 'train/ref_loss': None, 'train/uncertainty_loss': -9.899827418848873e-05, 'train/video_loss': 0.2305396944284439, 'train/total_loss': 0.5112128853797913} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0215, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026345893274992704, 'train/lm_loss': 3.223159583285451e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.23700493574142456, 'train/uncertainty_loss': 0.0021482333540916443, 'train/video_loss': 0.2412821650505066, 'train/total_loss': 0.24131439626216888} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4382, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.34, 'grad_norm': 6.3445305824279785, 'learning_rate': 6.617137788174288e-06}[Rank 1] Trainer log: {'loss': 0.34, 'grad_norm': 6.3445305824279785, 'learning_rate': 6.617137788174288e-06}[Rank 3] Trainer log: {'loss': 0.34, 'grad_norm': 6.3445305824279785, 'learning_rate': 6.617137788174288e-06} - - -[Rank 0] Trainer log: {'loss': 0.34, 'grad_norm': 6.3445305824279785, 'learning_rate': 6.617137788174288e-06} -{'loss': 0.34, 'grad_norm': 6.3445305824279785, 'learning_rate': 6.617137788174288e-06, 'epoch': 0.63} -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31550905704498294, 'train/info_loss': 0.16783924400806427, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00016000146279111506, 'train/video_loss': 0.16767923533916473, 'train/total_loss': 0.483188271522522} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.0431, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0117, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002121941652148962, 'train/lm_loss': 4.701061698142439e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.2368190884590149, 'train/uncertainty_loss': 0.001165374182164669, 'train/video_loss': 0.23970580101013184, 'train/total_loss': 0.23975281417369843} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2893, 'grad_norm': 5.428378582000732, 'learning_rate': 6.607101091377935e-06}[Rank 1] Trainer log: {'loss': 0.2893, 'grad_norm': 5.428378582000732, 'learning_rate': 6.607101091377935e-06} - -[Rank 0] Trainer log: {'loss': 0.2893, 'grad_norm': 5.428378582000732, 'learning_rate': 6.607101091377935e-06}[Rank 2] Trainer log: {'loss': 0.2893, 'grad_norm': 5.428378582000732, 'learning_rate': 6.607101091377935e-06} - -{'loss': 0.2893, 'grad_norm': 5.428378582000732, 'learning_rate': 6.607101091377935e-06, 'epoch': 0.63} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0117, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002668555593118072, 'train/lm_loss': 4.081309598404914e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.16082456707954407, 'train/uncertainty_loss': -7.026470848359168e-05, 'train/video_loss': 0.16291257739067078, 'train/total_loss': 0.16295339167118073} -tensor(0.2478, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003599084448069334, 'train/lm_loss': 2.8488991665653886e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.15264010429382324, 'train/uncertainty_loss': -7.875193259678782e-05, 'train/video_loss': 0.15546034276485443, 'train/total_loss': 0.15548883378505707} -[Rank 3] Trainer log: {'loss': 0.284, 'grad_norm': 6.461878776550293, 'learning_rate': 6.59706825558357e-06}[Rank 1] Trainer log: {'loss': 0.284, 'grad_norm': 6.461878776550293, 'learning_rate': 6.59706825558357e-06}[Rank 0] Trainer log: {'loss': 0.284, 'grad_norm': 6.461878776550293, 'learning_rate': 6.59706825558357e-06} - -[Rank 2] Trainer log: {'loss': 0.284, 'grad_norm': 6.461878776550293, 'learning_rate': 6.59706825558357e-06} - -{'loss': 0.284, 'grad_norm': 6.461878776550293, 'learning_rate': 6.59706825558357e-06, 'epoch': 0.63} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3859017848968506, 'train/info_loss': 0.14357684552669525, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012079784646630288, 'train/video_loss': 0.14345604181289673, 'train/total_loss': 0.5293577909469604} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2597, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1563, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3162880420684815, 'train/info_loss': 0.17349499464035034, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001164511893875897, 'train/video_loss': 0.17337854206562042, 'train/total_loss': 0.48966658115386963} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.404, 'grad_norm': 5.305942058563232, 'learning_rate': 6.587039292208228e-06}[Rank 0] Trainer log: {'loss': 0.404, 'grad_norm': 5.305942058563232, 'learning_rate': 6.587039292208228e-06} -[Rank 3] Trainer log: {'loss': 0.404, 'grad_norm': 5.305942058563232, 'learning_rate': 6.587039292208228e-06} -[Rank 2] Trainer log: {'loss': 0.404, 'grad_norm': 5.305942058563232, 'learning_rate': 6.587039292208228e-06} - -{'loss': 0.404, 'grad_norm': 5.305942058563232, 'learning_rate': 6.587039292208228e-06, 'epoch': 0.63} -tensor(0.1640, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001951065263710916, 'train/lm_loss': 5.406602285802365e-05, 'train/info_loss': 2.8907417799928226e-05, 'train/ref_loss': 0.3207503855228424, 'train/uncertainty_loss': 0.01639506816864014, 'train/video_loss': 0.3387352228164673, 'train/total_loss': 0.33878928422927856} -tensor(0.1012, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.0261, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26272268295288087, 'train/info_loss': 0.2123430371284485, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013714995002374054, 'train/video_loss': 0.2122058868408203, 'train/total_loss': 0.4749285876750946} -[Rank 1] Trainer log: {'loss': 0.2942, 'grad_norm': 9.286528587341309, 'learning_rate': 6.577014212664509e-06} -[Rank 0] Trainer log: {'loss': 0.2942, 'grad_norm': 9.286528587341309, 'learning_rate': 6.577014212664509e-06}[Rank 2] Trainer log: {'loss': 0.2942, 'grad_norm': 9.286528587341309, 'learning_rate': 6.577014212664509e-06} - -[Rank 3] Trainer log: {'loss': 0.2942, 'grad_norm': 9.286528587341309, 'learning_rate': 6.577014212664509e-06} -{'loss': 0.2942, 'grad_norm': 9.286528587341309, 'learning_rate': 6.577014212664509e-06, 'epoch': 0.63} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001512086600996554, 'train/lm_loss': 4.169506428297609e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.15990613400936127, 'train/uncertainty_loss': -7.192563498392701e-05, 'train/video_loss': 0.16106843948364258, 'train/total_loss': 0.16111013293266296} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001587690436281264, 'train/lm_loss': 2.8131413273513317e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.2049897015094757, 'train/uncertainty_loss': -7.324491161853075e-05, 'train/video_loss': 0.2062060385942459, 'train/total_loss': 0.2062341719865799} -tensor(0.1623, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.303, 'grad_norm': 3.8422646522521973, 'learning_rate': 6.566993028360619e-06}[Rank 1] Trainer log: {'loss': 0.303, 'grad_norm': 3.8422646522521973, 'learning_rate': 6.566993028360619e-06}[Rank 2] Trainer log: {'loss': 0.303, 'grad_norm': 3.8422646522521973, 'learning_rate': 6.566993028360619e-06} - - -[Rank 3] Trainer log: {'loss': 0.303, 'grad_norm': 3.8422646522521973, 'learning_rate': 6.566993028360619e-06} -{'loss': 0.303, 'grad_norm': 3.8422646522521973, 'learning_rate': 6.566993028360619e-06, 'epoch': 0.63} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00012653577141463757, 'train/lm_loss': 3.1611806480214004e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.11872692406177521, 'train/uncertainty_loss': -6.902707391418517e-05, 'train/video_loss': 0.11968991160392761, 'train/total_loss': 0.11972152441740036} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3506661415100098, 'train/info_loss': 0.25913870334625244, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013321470469236375, 'train/video_loss': 0.25900548696517944, 'train/total_loss': 0.6096715927124023} -[Rank 0] Trainer log: {'loss': 0.3943, 'grad_norm': 4.964505672454834, 'learning_rate': 6.556975750700319e-06}[Rank 3] Trainer log: {'loss': 0.3943, 'grad_norm': 4.964505672454834, 'learning_rate': 6.556975750700319e-06} -[Rank 1] Trainer log: {'loss': 0.3943, 'grad_norm': 4.964505672454834, 'learning_rate': 6.556975750700319e-06} -[Rank 2] Trainer log: {'loss': 0.3943, 'grad_norm': 4.964505672454834, 'learning_rate': 6.556975750700319e-06} - -{'loss': 0.3943, 'grad_norm': 4.964505672454834, 'learning_rate': 6.556975750700319e-06, 'epoch': 0.63} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.2073, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000481387646868825, 'train/lm_loss': 3.244613762944937e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.3491383194923401, 'train/uncertainty_loss': 0.020726756751537324, 'train/video_loss': 0.3737368881702423, 'train/total_loss': 0.3737693428993225} -tensor(0.5132, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0017, device='cuda:2', grad_fn=) tensor(-0.0017, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10741631984710694, 'train/info_loss': 0.25512468814849854, 'train/ref_loss': None, 'train/uncertainty_loss': -9.60618956014514e-05, 'train/video_loss': 0.255028635263443, 'train/total_loss': 0.3624449670314789} -tensor(0.0679, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4153, 'grad_norm': 2.8593058586120605, 'learning_rate': 6.546962391082919e-06}[Rank 0] Trainer log: {'loss': 0.4153, 'grad_norm': 2.8593058586120605, 'learning_rate': 6.546962391082919e-06} - -[Rank 2] Trainer log: {'loss': 0.4153, 'grad_norm': 2.8593058586120605, 'learning_rate': 6.546962391082919e-06} -[Rank 1] Trainer log: {'loss': 0.4153, 'grad_norm': 2.8593058586120605, 'learning_rate': 6.546962391082919e-06} -{'loss': 0.4153, 'grad_norm': 2.8593058586120605, 'learning_rate': 6.546962391082919e-06, 'epoch': 0.63} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33302786350250246, 'train/info_loss': 0.22788484394550323, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013468303950503469, 'train/video_loss': 0.22775016725063324, 'train/total_loss': 0.560778021812439} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12058727741241455, 'train/info_loss': 0.19958586990833282, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011925597209483385, 'train/video_loss': 0.19946661591529846, 'train/total_loss': 0.32005390524864197} -[Rank 0] Trainer log: {'loss': 0.289, 'grad_norm': 3.7362568378448486, 'learning_rate': 6.536952960903286e-06}[Rank 1] Trainer log: {'loss': 0.289, 'grad_norm': 3.7362568378448486, 'learning_rate': 6.536952960903286e-06} -[Rank 3] Trainer log: {'loss': 0.289, 'grad_norm': 3.7362568378448486, 'learning_rate': 6.536952960903286e-06} - -[Rank 2] Trainer log: {'loss': 0.289, 'grad_norm': 3.7362568378448486, 'learning_rate': 6.536952960903286e-06} -{'loss': 0.289, 'grad_norm': 3.7362568378448486, 'learning_rate': 6.536952960903286e-06, 'epoch': 0.63} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0442, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016721722204238176, 'train/lm_loss': 3.714216582011432e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.18321779370307922, 'train/uncertainty_loss': -7.436667801812291e-05, 'train/video_loss': 0.1845049411058426, 'train/total_loss': 0.18454208970069885} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06665327548980714, 'train/info_loss': 0.15369437634944916, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010948529234156013, 'train/video_loss': 0.15358489751815796, 'train/total_loss': 0.22023817896842957} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.1942, 'grad_norm': 1.3000973463058472, 'learning_rate': 6.526947471551799e-06} -[Rank 3] Trainer log: {'loss': 0.1942, 'grad_norm': 1.3000973463058472, 'learning_rate': 6.526947471551799e-06} -[Rank 0] Trainer log: {'loss': 0.1942, 'grad_norm': 1.3000973463058472, 'learning_rate': 6.526947471551799e-06} -[Rank 2] Trainer log: {'loss': 0.1942, 'grad_norm': 1.3000973463058472, 'learning_rate': 6.526947471551799e-06} -{'loss': 0.1942, 'grad_norm': 1.3000973463058472, 'learning_rate': 6.526947471551799e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.8902, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00012460139114409686, 'train/lm_loss': 6.155020673759282e-05, 'train/info_loss': 2.783459422062151e-05, 'train/ref_loss': 0.08551560342311859, 'train/uncertainty_loss': -7.283426239155233e-05, 'train/video_loss': 0.08646741509437561, 'train/total_loss': 0.0865289643406868} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27936291694641113, 'train/info_loss': 0.33039483428001404, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014666449278593065, 'train/video_loss': 0.3302481770515442, 'train/total_loss': 0.6096110939979553} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4639, 'grad_norm': 4.875245094299316, 'learning_rate': 6.5169459344143645e-06}[Rank 3] Trainer log: {'loss': 0.4639, 'grad_norm': 4.875245094299316, 'learning_rate': 6.5169459344143645e-06} - -[Rank 1] Trainer log: {'loss': 0.4639, 'grad_norm': 4.875245094299316, 'learning_rate': 6.5169459344143645e-06} -[Rank 0] Trainer log: {'loss': 0.4639, 'grad_norm': 4.875245094299316, 'learning_rate': 6.5169459344143645e-06} -{'loss': 0.4639, 'grad_norm': 4.875245094299316, 'learning_rate': 6.5169459344143645e-06, 'epoch': 0.63} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13339370489120483, 'train/info_loss': 0.16539119184017181, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011325639206916094, 'train/video_loss': 0.16527792811393738, 'train/total_loss': 0.2986716330051422} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.3119, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1525268077850342, 'train/info_loss': 0.16885998845100403, 'train/ref_loss': None, 'train/uncertainty_loss': -8.872559992596508e-05, 'train/video_loss': 0.16877126693725586, 'train/total_loss': 0.3212980628013611} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3202, 'grad_norm': 6.588247776031494, 'learning_rate': 6.506948360872385e-06}[Rank 0] Trainer log: {'loss': 0.3202, 'grad_norm': 6.588247776031494, 'learning_rate': 6.506948360872385e-06}[Rank 3] Trainer log: {'loss': 0.3202, 'grad_norm': 6.588247776031494, 'learning_rate': 6.506948360872385e-06} - -[Rank 1] Trainer log: {'loss': 0.3202, 'grad_norm': 6.588247776031494, 'learning_rate': 6.506948360872385e-06} - -{'loss': 0.3202, 'grad_norm': 6.588247776031494, 'learning_rate': 6.506948360872385e-06, 'epoch': 0.63} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.024344448745250703, 'train/info_loss': 0.18990488350391388, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010200284887105227, 'train/video_loss': 0.189802885055542, 'train/total_loss': 0.21414732933044434} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.3297, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2833009719848633, 'train/info_loss': 0.18821348249912262, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001010730746202171, 'train/video_loss': 0.18811240792274475, 'train/total_loss': 0.47141337394714355} -tensor(0.0191, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3501, 'grad_norm': 6.639851093292236, 'learning_rate': 6.496954762302751e-06}[Rank 3] Trainer log: {'loss': 0.3501, 'grad_norm': 6.639851093292236, 'learning_rate': 6.496954762302751e-06}[Rank 0] Trainer log: {'loss': 0.3501, 'grad_norm': 6.639851093292236, 'learning_rate': 6.496954762302751e-06} - -[Rank 2] Trainer log: {'loss': 0.3501, 'grad_norm': 6.639851093292236, 'learning_rate': 6.496954762302751e-06} - -{'loss': 0.3501, 'grad_norm': 6.639851093292236, 'learning_rate': 6.496954762302751e-06, 'epoch': 0.63} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.319260311126709, 'train/info_loss': 0.09162968397140503, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010067666880786419, 'train/video_loss': 0.0915290042757988, 'train/total_loss': 0.4107893407344818} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08863075375556946, 'train/info_loss': 0.2000458836555481, 'train/ref_loss': None, 'train/uncertainty_loss': -8.670504321344197e-05, 'train/video_loss': 0.1999591737985611, 'train/total_loss': 0.2885899245738983} -tensor(0.0389, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2551, 'grad_norm': 5.45242166519165, 'learning_rate': 6.486965150077843e-06}[Rank 3] Trainer log: {'loss': 0.2551, 'grad_norm': 5.45242166519165, 'learning_rate': 6.486965150077843e-06}[Rank 2] Trainer log: {'loss': 0.2551, 'grad_norm': 5.45242166519165, 'learning_rate': 6.486965150077843e-06} - - -[Rank 0] Trainer log: {'loss': 0.2551, 'grad_norm': 5.45242166519165, 'learning_rate': 6.486965150077843e-06} -{'loss': 0.2551, 'grad_norm': 5.45242166519165, 'learning_rate': 6.486965150077843e-06, 'epoch': 0.63} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2830339908599854, 'train/info_loss': 0.23976604640483856, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011805875692516565, 'train/video_loss': 0.2396479845046997, 'train/total_loss': 0.5226819515228271} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08788570761680603, 'train/info_loss': 0.16839271783828735, 'train/ref_loss': None, 'train/uncertainty_loss': -9.393137879669667e-05, 'train/video_loss': 0.16829878091812134, 'train/total_loss': 0.25618448853492737} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4178, 'grad_norm': 2.5410315990448, 'learning_rate': 6.476979535565486e-06}[Rank 0] Trainer log: {'loss': 0.4178, 'grad_norm': 2.5410315990448, 'learning_rate': 6.476979535565486e-06} -[Rank 3] Trainer log: {'loss': 0.4178, 'grad_norm': 2.5410315990448, 'learning_rate': 6.476979535565486e-06} - -{'loss': 0.4178, 'grad_norm': 2.5410315990448, 'learning_rate': 6.476979535565486e-06, 'epoch': 0.63} -[Rank 1] Trainer log: {'loss': 0.4178, 'grad_norm': 2.5410315990448, 'learning_rate': 6.476979535565486e-06} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.8026, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019703363068401814, 'train/lm_loss': 4.6295530046336354e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.6734815835952759, 'train/uncertainty_loss': 0.08025939464569093, 'train/video_loss': 0.7553369402885437, 'train/total_loss': 0.7553832530975342} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027363419067114595, 'train/lm_loss': 4.765419580508024e-05, 'train/info_loss': 2.6106152290594764e-05, 'train/ref_loss': 0.13177955150604248, 'train/uncertainty_loss': -7.680937997065485e-05, 'train/video_loss': 0.1339179128408432, 'train/total_loss': 0.13396556675434113} -[Rank 1] Trainer log: {'loss': 0.5002, 'grad_norm': 9.996387481689453, 'learning_rate': 6.466997930128972e-06}[Rank 0] Trainer log: {'loss': 0.5002, 'grad_norm': 9.996387481689453, 'learning_rate': 6.466997930128972e-06} -[Rank 3] Trainer log: {'loss': 0.5002, 'grad_norm': 9.996387481689453, 'learning_rate': 6.466997930128972e-06}[Rank 2] Trainer log: {'loss': 0.5002, 'grad_norm': 9.996387481689453, 'learning_rate': 6.466997930128972e-06} - - -{'loss': 0.5002, 'grad_norm': 9.996387481689453, 'learning_rate': 6.466997930128972e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3542498588562012, 'train/info_loss': 0.207747220993042, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013553863391280174, 'train/video_loss': 0.20761168003082275, 'train/total_loss': 0.561861515045166} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.49723491668701175, 'train/info_loss': 0.2280610352754593, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001228578039444983, 'train/video_loss': 0.22793817520141602, 'train/total_loss': 0.7251731157302856} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(0.1043, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4736, 'grad_norm': 5.557400703430176, 'learning_rate': 6.457020345127018e-06}[Rank 1] Trainer log: {'loss': 0.4736, 'grad_norm': 5.557400703430176, 'learning_rate': 6.457020345127018e-06} -[Rank 0] Trainer log: {'loss': 0.4736, 'grad_norm': 5.557400703430176, 'learning_rate': 6.457020345127018e-06} -[Rank 3] Trainer log: {'loss': 0.4736, 'grad_norm': 5.557400703430176, 'learning_rate': 6.457020345127018e-06} - -{'loss': 0.4736, 'grad_norm': 5.557400703430176, 'learning_rate': 6.457020345127018e-06, 'epoch': 0.63} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3651778221130371, 'train/info_loss': 0.21958814561367035, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000114200240932405, 'train/video_loss': 0.2194739431142807, 'train/total_loss': 0.58465176820755} -tensor(0.3663, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4141, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25783512592315677, 'train/info_loss': 0.18598683178424835, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014223685720935464, 'train/video_loss': 0.18584460020065308, 'train/total_loss': 0.44367972016334534} -tensor(0.1532, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2116, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4201, 'grad_norm': 4.435382843017578, 'learning_rate': 6.447046791913776e-06}[Rank 3] Trainer log: {'loss': 0.4201, 'grad_norm': 4.435382843017578, 'learning_rate': 6.447046791913776e-06} -[Rank 0] Trainer log: {'loss': 0.4201, 'grad_norm': 4.435382843017578, 'learning_rate': 6.447046791913776e-06} - -[Rank 1] Trainer log: {'loss': 0.4201, 'grad_norm': 4.435382843017578, 'learning_rate': 6.447046791913776e-06} -{'loss': 0.4201, 'grad_norm': 4.435382843017578, 'learning_rate': 6.447046791913776e-06, 'epoch': 0.63} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24868378639221192, 'train/info_loss': 0.1556326299905777, 'train/ref_loss': None, 'train/uncertainty_loss': -0.000128312804736197, 'train/video_loss': 0.15550431609153748, 'train/total_loss': 0.4041880965232849} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.6787, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0250, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001688216463662684, 'train/lm_loss': 3.2517651561647654e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.24514059722423553, 'train/uncertainty_loss': 0.0024956852197647096, 'train/video_loss': 0.24900783598423004, 'train/total_loss': 0.24904035031795502} -[Rank 2] Trainer log: {'loss': 0.4857, 'grad_norm': 8.076006889343262, 'learning_rate': 6.437077281838801e-06}[Rank 1] Trainer log: {'loss': 0.4857, 'grad_norm': 8.076006889343262, 'learning_rate': 6.437077281838801e-06}[Rank 3] Trainer log: {'loss': 0.4857, 'grad_norm': 8.076006889343262, 'learning_rate': 6.437077281838801e-06} - - -[Rank 0] Trainer log: {'loss': 0.4857, 'grad_norm': 8.076006889343262, 'learning_rate': 6.437077281838801e-06} -{'loss': 0.4857, 'grad_norm': 8.076006889343262, 'learning_rate': 6.437077281838801e-06, 'epoch': 0.64} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.16820906400680544, 'train/info_loss': 0.1687648743391037, 'train/ref_loss': None, 'train/uncertainty_loss': -8.293355349451304e-05, 'train/video_loss': 0.16868193447589874, 'train/total_loss': 0.3368909955024719} -tensor(0.1866, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.6921, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003720959881320596, 'train/lm_loss': 3.2017051125876606e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.6271119117736816, 'train/uncertainty_loss': 0.06921260356903076, 'train/video_loss': 0.6993210315704346, 'train/total_loss': 0.699353039264679} -[Rank 1] Trainer log: {'loss': 0.3449, 'grad_norm': 11.88251781463623, 'learning_rate': 6.427111826247057e-06}[Rank 0] Trainer log: {'loss': 0.3449, 'grad_norm': 11.88251781463623, 'learning_rate': 6.427111826247057e-06} - -[Rank 2] Trainer log: {'loss': 0.3449, 'grad_norm': 11.88251781463623, 'learning_rate': 6.427111826247057e-06} -[Rank 3] Trainer log: {'loss': 0.3449, 'grad_norm': 11.88251781463623, 'learning_rate': 6.427111826247057e-06} -{'loss': 0.3449, 'grad_norm': 11.88251781463623, 'learning_rate': 6.427111826247057e-06, 'epoch': 0.64} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10532732009887696, 'train/info_loss': 0.14774031937122345, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010902108624577522, 'train/video_loss': 0.14763130247592926, 'train/total_loss': 0.25295862555503845} -tensor(0.3036, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2430433750152588, 'train/info_loss': 0.16200755536556244, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012387816095724702, 'train/video_loss': 0.16188368201255798, 'train/total_loss': 0.4049270749092102} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0830, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3578, 'grad_norm': 4.1055989265441895, 'learning_rate': 6.417150436478887e-06}[Rank 2] Trainer log: {'loss': 0.3578, 'grad_norm': 4.1055989265441895, 'learning_rate': 6.417150436478887e-06} - -[Rank 3] Trainer log: {'loss': 0.3578, 'grad_norm': 4.1055989265441895, 'learning_rate': 6.417150436478887e-06} -[Rank 0] Trainer log: {'loss': 0.3578, 'grad_norm': 4.1055989265441895, 'learning_rate': 6.417150436478887e-06} -{'loss': 0.3578, 'grad_norm': 4.1055989265441895, 'learning_rate': 6.417150436478887e-06, 'epoch': 0.64} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3513289213180542, 'train/info_loss': 0.19261059165000916, 'train/ref_loss': None, 'train/uncertainty_loss': -9.939344599843026e-05, 'train/video_loss': 0.1925112009048462, 'train/total_loss': 0.5438401699066162} -tensor(0.0902, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1153, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33470332622528076, 'train/info_loss': 0.18229101598262787, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011197630083188416, 'train/video_loss': 0.1821790337562561, 'train/total_loss': 0.5168823599815369} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2183, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4233, 'grad_norm': 7.252058029174805, 'learning_rate': 6.407193123870006e-06} -[Rank 0] Trainer log: {'loss': 0.4233, 'grad_norm': 7.252058029174805, 'learning_rate': 6.407193123870006e-06}[Rank 3] Trainer log: {'loss': 0.4233, 'grad_norm': 7.252058029174805, 'learning_rate': 6.407193123870006e-06} - -[Rank 2] Trainer log: {'loss': 0.4233, 'grad_norm': 7.252058029174805, 'learning_rate': 6.407193123870006e-06} -{'loss': 0.4233, 'grad_norm': 7.252058029174805, 'learning_rate': 6.407193123870006e-06, 'epoch': 0.64} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2379765272140503, 'train/info_loss': 0.22677458822727203, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001328581594862044, 'train/video_loss': 0.2266417294740677, 'train/total_loss': 0.4646182656288147} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0555, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1915, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29035291671752933, 'train/info_loss': 0.1763470470905304, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010752762900665403, 'train/video_loss': 0.1762395203113556, 'train/total_loss': 0.4665924310684204} -[Rank 3] Trainer log: {'loss': 0.3917, 'grad_norm': 3.26718807220459, 'learning_rate': 6.397239899751501e-06}[Rank 1] Trainer log: {'loss': 0.3917, 'grad_norm': 3.26718807220459, 'learning_rate': 6.397239899751501e-06} - -[Rank 2] Trainer log: {'loss': 0.3917, 'grad_norm': 3.26718807220459, 'learning_rate': 6.397239899751501e-06} -[Rank 0] Trainer log: {'loss': 0.3917, 'grad_norm': 3.26718807220459, 'learning_rate': 6.397239899751501e-06} -{'loss': 0.3917, 'grad_norm': 3.26718807220459, 'learning_rate': 6.397239899751501e-06, 'epoch': 0.64} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0665, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0531, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031842123717069627, 'train/lm_loss': 3.235078474972397e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.2420630306005478, 'train/uncertainty_loss': 0.005313643068075181, 'train/video_loss': 0.24994538724422455, 'train/total_loss': 0.2499777376651764} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.0435, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023039560765028002, 'train/lm_loss': 2.496086817700416e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.12019117176532745, 'train/uncertainty_loss': -6.647808477282524e-05, 'train/video_loss': 0.12198609113693237, 'train/total_loss': 0.12201105058193207} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2883, 'grad_norm': 4.737313747406006, 'learning_rate': 6.387290775449786e-06}[Rank 2] Trainer log: {'loss': 0.2883, 'grad_norm': 4.737313747406006, 'learning_rate': 6.387290775449786e-06}[Rank 3] Trainer log: {'loss': 0.2883, 'grad_norm': 4.737313747406006, 'learning_rate': 6.387290775449786e-06} - - -[Rank 0] Trainer log: {'loss': 0.2883, 'grad_norm': 4.737313747406006, 'learning_rate': 6.387290775449786e-06} -{'loss': 0.2883, 'grad_norm': 4.737313747406006, 'learning_rate': 6.387290775449786e-06, 'epoch': 0.64} -tensor(-0.0015, device='cuda:0', grad_fn=) tensor(-0.0015, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5843634605407715, 'train/info_loss': 0.24821121990680695, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015089901862666013, 'train/video_loss': 0.24806031584739685, 'train/total_loss': 0.8324238061904907} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.5708, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3139958381652832, 'train/info_loss': 0.1876799464225769, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012813085922971367, 'train/video_loss': 0.187551811337471, 'train/total_loss': 0.501547634601593} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4589, 'grad_norm': 8.782222747802734, 'learning_rate': 6.377345762286633e-06}[Rank 1] Trainer log: {'loss': 0.4589, 'grad_norm': 8.782222747802734, 'learning_rate': 6.377345762286633e-06}[Rank 0] Trainer log: {'loss': 0.4589, 'grad_norm': 8.782222747802734, 'learning_rate': 6.377345762286633e-06} - -[Rank 2] Trainer log: {'loss': 0.4589, 'grad_norm': 8.782222747802734, 'learning_rate': 6.377345762286633e-06} - -{'loss': 0.4589, 'grad_norm': 8.782222747802734, 'learning_rate': 6.377345762286633e-06, 'epoch': 0.64} -tensor(0.2761, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1664, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4432, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1164, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002568821888417006, 'train/lm_loss': 3.673692990560085e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2821768820285797, 'train/uncertainty_loss': 0.011635564267635345, 'train/video_loss': 0.2958866357803345, 'train/total_loss': 0.2959233820438385} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37059075832366944, 'train/info_loss': 0.2774847745895386, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012379352701827884, 'train/video_loss': 0.2773609757423401, 'train/total_loss': 0.6479517221450806} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1455, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3778, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4628, 'grad_norm': 10.091394424438477, 'learning_rate': 6.367404871579114e-06}[Rank 2] Trainer log: {'loss': 0.4628, 'grad_norm': 10.091394424438477, 'learning_rate': 6.367404871579114e-06} -[Rank 3] Trainer log: {'loss': 0.4628, 'grad_norm': 10.091394424438477, 'learning_rate': 6.367404871579114e-06} - -[Rank 0] Trainer log: {'loss': 0.4628, 'grad_norm': 10.091394424438477, 'learning_rate': 6.367404871579114e-06} -{'loss': 0.4628, 'grad_norm': 10.091394424438477, 'learning_rate': 6.367404871579114e-06, 'epoch': 0.64} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2252, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0016, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020579828415066004, 'train/lm_loss': 2.8059899341315032e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.20408231019973755, 'train/uncertainty_loss': 0.0001641893293708563, 'train/video_loss': 0.20591172575950623, 'train/total_loss': 0.20593978464603424} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1394798517227173, 'train/info_loss': 0.14528746902942657, 'train/ref_loss': None, 'train/uncertainty_loss': -9.472893434576691e-05, 'train/video_loss': 0.14519274234771729, 'train/total_loss': 0.2846726179122925} -tensor(0.1626, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3752, 'grad_norm': 15.192512512207031, 'learning_rate': 6.357468114639627e-06} -[Rank 0] Trainer log: {'loss': 0.3752, 'grad_norm': 15.192512512207031, 'learning_rate': 6.357468114639627e-06}[Rank 2] Trainer log: {'loss': 0.3752, 'grad_norm': 15.192512512207031, 'learning_rate': 6.357468114639627e-06} -[Rank 3] Trainer log: {'loss': 0.3752, 'grad_norm': 15.192512512207031, 'learning_rate': 6.357468114639627e-06} - -{'loss': 0.3752, 'grad_norm': 15.192512512207031, 'learning_rate': 6.357468114639627e-06, 'epoch': 0.64} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18478609323501588, 'train/info_loss': 0.16282954812049866, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011444380506873131, 'train/video_loss': 0.1627151072025299, 'train/total_loss': 0.3475012183189392} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.435119104385376, 'train/info_loss': 0.1848669797182083, 'train/ref_loss': None, 'train/uncertainty_loss': -9.895178955048323e-05, 'train/video_loss': 0.18476802110671997, 'train/total_loss': 0.619887113571167} -tensor(0.1456, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0402, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4269, 'grad_norm': 2.4569830894470215, 'learning_rate': 6.347535502775858e-06}[Rank 0] Trainer log: {'loss': 0.4269, 'grad_norm': 2.4569830894470215, 'learning_rate': 6.347535502775858e-06} -[Rank 2] Trainer log: {'loss': 0.4269, 'grad_norm': 2.4569830894470215, 'learning_rate': 6.347535502775858e-06} - -[Rank 3] Trainer log: {'loss': 0.4269, 'grad_norm': 2.4569830894470215, 'learning_rate': 6.347535502775858e-06} -{'loss': 0.4269, 'grad_norm': 2.4569830894470215, 'learning_rate': 6.347535502775858e-06, 'epoch': 0.64} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23419005870819093, 'train/info_loss': 0.3472766578197479, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001353435800410807, 'train/video_loss': 0.3471413254737854, 'train/total_loss': 0.5813313722610474} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018881360301747918, 'train/lm_loss': 2.7797673828899862e-05, 'train/info_loss': 1.8536700736149214e-05, 'train/ref_loss': 0.1731337010860443, 'train/uncertainty_loss': -7.192025077529252e-05, 'train/video_loss': 0.1745908409357071, 'train/total_loss': 0.17461863160133362} -[Rank 0] Trainer log: {'loss': 0.3817, 'grad_norm': 3.4815046787261963, 'learning_rate': 6.337607047290774e-06}[Rank 2] Trainer log: {'loss': 0.3817, 'grad_norm': 3.4815046787261963, 'learning_rate': 6.337607047290774e-06} -[Rank 1] Trainer log: {'loss': 0.3817, 'grad_norm': 3.4815046787261963, 'learning_rate': 6.337607047290774e-06} - -[Rank 3] Trainer log: {'loss': 0.3817, 'grad_norm': 3.4815046787261963, 'learning_rate': 6.337607047290774e-06} -{'loss': 0.3817, 'grad_norm': 3.4815046787261963, 'learning_rate': 6.337607047290774e-06, 'epoch': 0.64} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2465, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001688797725364566, 'train/lm_loss': 4.696294490713626e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.3801755905151367, 'train/uncertainty_loss': 0.02465268820524216, 'train/video_loss': 0.4062027335166931, 'train/total_loss': 0.4062497019767761} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(0.1611, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11727348566055298, 'train/info_loss': 0.17382323741912842, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010269143385812641, 'train/video_loss': 0.17372053861618042, 'train/total_loss': 0.2909940183162689} -tensor(0.0435, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0413, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3642, 'grad_norm': 2.110673666000366, 'learning_rate': 6.3276827594826184e-06}[Rank 2] Trainer log: {'loss': 0.3642, 'grad_norm': 2.110673666000366, 'learning_rate': 6.3276827594826184e-06} -[Rank 0] Trainer log: {'loss': 0.3642, 'grad_norm': 2.110673666000366, 'learning_rate': 6.3276827594826184e-06} - -[Rank 1] Trainer log: {'loss': 0.3642, 'grad_norm': 2.110673666000366, 'learning_rate': 6.3276827594826184e-06} -{'loss': 0.3642, 'grad_norm': 2.110673666000366, 'learning_rate': 6.3276827594826184e-06, 'epoch': 0.64} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(0.1110, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04202154576778412, 'train/info_loss': 0.23636578023433685, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011641806922852994, 'train/video_loss': 0.23624935746192932, 'train/total_loss': 0.2782709002494812} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020210151560604573, 'train/lm_loss': 2.8632022440433504e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.1843775063753128, 'train/uncertainty_loss': -6.8293372169137e-05, 'train/video_loss': 0.1859460473060608, 'train/total_loss': 0.18597467243671417} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3904, 'grad_norm': 2.5221827030181885, 'learning_rate': 6.317762650644884e-06}[Rank 3] Trainer log: {'loss': 0.3904, 'grad_norm': 2.5221827030181885, 'learning_rate': 6.317762650644884e-06}[Rank 1] Trainer log: {'loss': 0.3904, 'grad_norm': 2.5221827030181885, 'learning_rate': 6.317762650644884e-06} - -[Rank 2] Trainer log: {'loss': 0.3904, 'grad_norm': 2.5221827030181885, 'learning_rate': 6.317762650644884e-06} - -{'loss': 0.3904, 'grad_norm': 2.5221827030181885, 'learning_rate': 6.317762650644884e-06, 'epoch': 0.64} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32770459651947026, 'train/info_loss': 0.23873406648635864, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013407416408881546, 'train/video_loss': 0.2385999858379364, 'train/total_loss': 0.5663045644760132} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0670, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3156224966049195, 'train/info_loss': 0.21184027194976807, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012085732305422425, 'train/video_loss': 0.21171940863132477, 'train/total_loss': 0.527341902256012} -tensor(0.0106, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.333, 'grad_norm': 4.674950122833252, 'learning_rate': 6.307846732066318e-06} -[Rank 3] Trainer log: {'loss': 0.333, 'grad_norm': 4.674950122833252, 'learning_rate': 6.307846732066318e-06} -[Rank 0] Trainer log: {'loss': 0.333, 'grad_norm': 4.674950122833252, 'learning_rate': 6.307846732066318e-06} -[Rank 2] Trainer log: {'loss': 0.333, 'grad_norm': 4.674950122833252, 'learning_rate': 6.307846732066318e-06} -{'loss': 0.333, 'grad_norm': 4.674950122833252, 'learning_rate': 6.307846732066318e-06, 'epoch': 0.64} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19275628328323366, 'train/info_loss': 0.21120697259902954, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001346058095805347, 'train/video_loss': 0.21107237040996552, 'train/total_loss': 0.4038286507129669} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2247, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11187375783920289, 'train/info_loss': 0.18693529069423676, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010637781815603375, 'train/video_loss': 0.18682891130447388, 'train/total_loss': 0.2987026572227478} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4025, 'grad_norm': 8.251299858093262, 'learning_rate': 6.29793501503089e-06}[Rank 0] Trainer log: {'loss': 0.4025, 'grad_norm': 8.251299858093262, 'learning_rate': 6.29793501503089e-06} -[Rank 3] Trainer log: {'loss': 0.4025, 'grad_norm': 8.251299858093262, 'learning_rate': 6.29793501503089e-06} -[Rank 2] Trainer log: {'loss': 0.4025, 'grad_norm': 8.251299858093262, 'learning_rate': 6.29793501503089e-06} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) - -{'loss': 0.4025, 'grad_norm': 8.251299858093262, 'learning_rate': 6.29793501503089e-06, 'epoch': 0.64} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38175806999206546, 'train/info_loss': 0.20703017711639404, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012038792483508587, 'train/video_loss': 0.20690979063510895, 'train/total_loss': 0.5886678695678711} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0700, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1122, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003048209007829428, 'train/lm_loss': 2.200482413172722e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.27241185307502747, 'train/uncertainty_loss': 0.011216412484645845, 'train/video_loss': 0.28608453273773193, 'train/total_loss': 0.28610652685165405} -[Rank 1] Trainer log: {'loss': 0.424, 'grad_norm': 3.267937660217285, 'learning_rate': 6.2880275108177915e-06}[Rank 0] Trainer log: {'loss': 0.424, 'grad_norm': 3.267937660217285, 'learning_rate': 6.2880275108177915e-06}[Rank 2] Trainer log: {'loss': 0.424, 'grad_norm': 3.267937660217285, 'learning_rate': 6.2880275108177915e-06} - - -[Rank 3] Trainer log: {'loss': 0.424, 'grad_norm': 3.267937660217285, 'learning_rate': 6.2880275108177915e-06} -{'loss': 0.424, 'grad_norm': 3.267937660217285, 'learning_rate': 6.2880275108177915e-06, 'epoch': 0.64} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0017, device='cuda:3', grad_fn=) tensor(-0.0017, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.47585754394531254, 'train/info_loss': 0.21883225440979004, 'train/ref_loss': None, 'train/uncertainty_loss': -8.16467043478042e-05, 'train/video_loss': 0.21875061094760895, 'train/total_loss': 0.6946081519126892} -tensor(1.3739, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016568151768296959, 'train/lm_loss': 4.1671225335448984e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 1.2305277585983276, 'train/uncertainty_loss': 0.13739416599273682, 'train/video_loss': 1.3692690134048462, 'train/total_loss': 1.369310736656189} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.5476, 'grad_norm': 8.83580207824707, 'learning_rate': 6.278124230701428e-06}[Rank 3] Trainer log: {'loss': 0.5476, 'grad_norm': 8.83580207824707, 'learning_rate': 6.278124230701428e-06}[Rank 2] Trainer log: {'loss': 0.5476, 'grad_norm': 8.83580207824707, 'learning_rate': 6.278124230701428e-06} - - -[Rank 0] Trainer log: {'loss': 0.5476, 'grad_norm': 8.83580207824707, 'learning_rate': 6.278124230701428e-06} -{'loss': 0.5476, 'grad_norm': 8.83580207824707, 'learning_rate': 6.278124230701428e-06, 'epoch': 0.64} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19114190340042114, 'train/info_loss': 0.233725443482399, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011888383887708187, 'train/video_loss': 0.23360656201839447, 'train/total_loss': 0.4247484803199768} -tensor(0.0269, device='cuda:1', grad_fn=) tensor(-0.0006, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40742115974426274, 'train/info_loss': 0.1600034534931183, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010219960240647197, 'train/video_loss': 0.15990126132965088, 'train/total_loss': 0.5673224329948425} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1780, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0419, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2992, 'grad_norm': 3.0492100715637207, 'learning_rate': 6.268225185951382e-06} -[Rank 2] Trainer log: {'loss': 0.2992, 'grad_norm': 3.0492100715637207, 'learning_rate': 6.268225185951382e-06} -[Rank 3] Trainer log: {'loss': 0.2992, 'grad_norm': 3.0492100715637207, 'learning_rate': 6.268225185951382e-06} -[Rank 0] Trainer log: {'loss': 0.2992, 'grad_norm': 3.0492100715637207, 'learning_rate': 6.268225185951382e-06} -{'loss': 0.2992, 'grad_norm': 3.0492100715637207, 'learning_rate': 6.268225185951382e-06, 'epoch': 0.64} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.1675, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022998882923275232, 'train/lm_loss': 4.169506428297609e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.11930997669696808, 'train/uncertainty_loss': -6.700734957121313e-05, 'train/video_loss': 0.12110421806573868, 'train/total_loss': 0.12114591151475906} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30424854755401615, 'train/info_loss': 0.19643722474575043, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012262032832950355, 'train/video_loss': 0.19631460309028625, 'train/total_loss': 0.5005631446838379} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3233, 'grad_norm': 2.0171010494232178, 'learning_rate': 6.258330387832433e-06}[Rank 3] Trainer log: {'loss': 0.3233, 'grad_norm': 2.0171010494232178, 'learning_rate': 6.258330387832433e-06} -[Rank 2] Trainer log: {'loss': 0.3233, 'grad_norm': 2.0171010494232178, 'learning_rate': 6.258330387832433e-06} - -[Rank 0] Trainer log: {'loss': 0.3233, 'grad_norm': 2.0171010494232178, 'learning_rate': 6.258330387832433e-06} -{'loss': 0.3233, 'grad_norm': 2.0171010494232178, 'learning_rate': 6.258330387832433e-06, 'epoch': 0.64} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2773042678833008, 'train/info_loss': 0.29682791233062744, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001156082027591765, 'train/video_loss': 0.29671230912208557, 'train/total_loss': 0.5740165710449219} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2735, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001962633337825537, 'train/lm_loss': 4.217179957777262e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.1727827489376068, 'train/uncertainty_loss': -6.894501857459546e-05, 'train/video_loss': 0.1743076890707016, 'train/total_loss': 0.1743498593568802} -[Rank 1] Trainer log: {'loss': 0.3384, 'grad_norm': 4.034315586090088, 'learning_rate': 6.248439847604513e-06} -[Rank 2] Trainer log: {'loss': 0.3384, 'grad_norm': 4.034315586090088, 'learning_rate': 6.248439847604513e-06} -[Rank 3] Trainer log: {'loss': 0.3384, 'grad_norm': 4.034315586090088, 'learning_rate': 6.248439847604513e-06} -[Rank 0] Trainer log: {'loss': 0.3384, 'grad_norm': 4.034315586090088, 'learning_rate': 6.248439847604513e-06} -{'loss': 0.3384, 'grad_norm': 4.034315586090088, 'learning_rate': 6.248439847604513e-06, 'epoch': 0.64} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3418708086013794, 'train/info_loss': 0.18866977095603943, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012293491745367648, 'train/video_loss': 0.18854683637619019, 'train/total_loss': 0.5304176807403564} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0146, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2975023746490479, 'train/info_loss': 0.2976798415184021, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013148145517334343, 'train/video_loss': 0.2975483536720276, 'train/total_loss': 0.5950506925582886} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0423, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3756, 'grad_norm': 3.436814785003662, 'learning_rate': 6.238553576522721e-06} -[Rank 3] Trainer log: {'loss': 0.3756, 'grad_norm': 3.436814785003662, 'learning_rate': 6.238553576522721e-06} -[Rank 0] Trainer log: {'loss': 0.3756, 'grad_norm': 3.436814785003662, 'learning_rate': 6.238553576522721e-06} -[Rank 2] Trainer log: {'loss': 0.3756, 'grad_norm': 3.436814785003662, 'learning_rate': 6.238553576522721e-06} -{'loss': 0.3756, 'grad_norm': 3.436814785003662, 'learning_rate': 6.238553576522721e-06, 'epoch': 0.64} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.37059640884399414, 'train/info_loss': 0.16949284076690674, 'train/ref_loss': None, 'train/uncertainty_loss': -9.805648587644101e-05, 'train/video_loss': 0.16939479112625122, 'train/total_loss': 0.5399911999702454} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.5325, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0440, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19157867431640627, 'train/info_loss': 0.08607473969459534, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011144621530547739, 'train/video_loss': 0.08596329391002655, 'train/total_loss': 0.27754196524620056} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3935, 'grad_norm': 4.37644100189209, 'learning_rate': 6.2286715858372894e-06}[Rank 3] Trainer log: {'loss': 0.3935, 'grad_norm': 4.37644100189209, 'learning_rate': 6.2286715858372894e-06}[Rank 0] Trainer log: {'loss': 0.3935, 'grad_norm': 4.37644100189209, 'learning_rate': 6.2286715858372894e-06} - - -[Rank 2] Trainer log: {'loss': 0.3935, 'grad_norm': 4.37644100189209, 'learning_rate': 6.2286715858372894e-06} -{'loss': 0.3935, 'grad_norm': 4.37644100189209, 'learning_rate': 6.2286715858372894e-06, 'epoch': 0.64} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4139266967773438, 'train/info_loss': 0.18684661388397217, 'train/ref_loss': None, 'train/uncertainty_loss': -9.144659270532429e-05, 'train/video_loss': 0.18675516545772552, 'train/total_loss': 0.6006818413734436} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1731, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0521, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002170149004086852, 'train/lm_loss': 5.387534038163722e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.2329559624195099, 'train/uncertainty_loss': 0.005214526876807213, 'train/video_loss': 0.23993003368377686, 'train/total_loss': 0.2399839162826538} -tensor(0.2171, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3771, 'grad_norm': 2.9737040996551514, 'learning_rate': 6.218793886793584e-06}[Rank 2] Trainer log: {'loss': 0.3771, 'grad_norm': 2.9737040996551514, 'learning_rate': 6.218793886793584e-06}[Rank 1] Trainer log: {'loss': 0.3771, 'grad_norm': 2.9737040996551514, 'learning_rate': 6.218793886793584e-06} - -[Rank 3] Trainer log: {'loss': 0.3771, 'grad_norm': 2.9737040996551514, 'learning_rate': 6.218793886793584e-06} - -{'loss': 0.3771, 'grad_norm': 2.9737040996551514, 'learning_rate': 6.218793886793584e-06, 'epoch': 0.64} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.25354933738708496, 'train/info_loss': 0.17772160470485687, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012239664793014526, 'train/video_loss': 0.1775992065668106, 'train/total_loss': 0.4311485290527344} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -{'train/tv_loss': 0.000168933323584497, 'train/lm_loss': 4.169506428297609e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.1592310070991516, 'train/uncertainty_loss': -6.975583964958787e-05, 'train/video_loss': 0.1605364978313446, 'train/total_loss': 0.160578191280365} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3207, 'grad_norm': 1.793711543083191, 'learning_rate': 6.2089204906320886e-06} -[Rank 2] Trainer log: {'loss': 0.3207, 'grad_norm': 1.793711543083191, 'learning_rate': 6.2089204906320886e-06}[Rank 1] Trainer log: {'loss': 0.3207, 'grad_norm': 1.793711543083191, 'learning_rate': 6.2089204906320886e-06} - -[Rank 0] Trainer log: {'loss': 0.3207, 'grad_norm': 1.793711543083191, 'learning_rate': 6.2089204906320886e-06} -{'loss': 0.3207, 'grad_norm': 1.793711543083191, 'learning_rate': 6.2089204906320886e-06, 'epoch': 0.64} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3613285779953003, 'train/info_loss': 0.24335099756717682, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011424930999055506, 'train/video_loss': 0.2432367503643036, 'train/total_loss': 0.6045653223991394} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1495409607887268, 'train/info_loss': 0.20983938872814178, 'train/ref_loss': None, 'train/uncertainty_loss': -9.613796137273312e-05, 'train/video_loss': 0.20974324643611908, 'train/total_loss': 0.3592842221260071} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3375, 'grad_norm': 2.3547706604003906, 'learning_rate': 6.199051408588379e-06} -[Rank 3] Trainer log: {'loss': 0.3375, 'grad_norm': 2.3547706604003906, 'learning_rate': 6.199051408588379e-06} -[Rank 2] Trainer log: {'loss': 0.3375, 'grad_norm': 2.3547706604003906, 'learning_rate': 6.199051408588379e-06}[Rank 0] Trainer log: {'loss': 0.3375, 'grad_norm': 2.3547706604003906, 'learning_rate': 6.199051408588379e-06} - -{'loss': 0.3375, 'grad_norm': 2.3547706604003906, 'learning_rate': 6.199051408588379e-06, 'epoch': 0.64} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30516688823699956, 'train/info_loss': 0.22634294629096985, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011759981280192734, 'train/video_loss': 0.226225346326828, 'train/total_loss': 0.5313922166824341} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.0242, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00013268664479255677, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.07129465043544769, 'train/uncertainty_loss': -6.916296551935374e-05, 'train/video_loss': 0.07230795919895172, 'train/total_loss': 0.07234452664852142} -tensor(-0.0006, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3089, 'grad_norm': 1.3960052728652954, 'learning_rate': 6.1891866518931445e-06}[Rank 0] Trainer log: {'loss': 0.3089, 'grad_norm': 1.3960052728652954, 'learning_rate': 6.1891866518931445e-06}[Rank 2] Trainer log: {'loss': 0.3089, 'grad_norm': 1.3960052728652954, 'learning_rate': 6.1891866518931445e-06} - - -[Rank 3] Trainer log: {'loss': 0.3089, 'grad_norm': 1.3960052728652954, 'learning_rate': 6.1891866518931445e-06} -{'loss': 0.3089, 'grad_norm': 1.3960052728652954, 'learning_rate': 6.1891866518931445e-06, 'epoch': 0.64} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.5336001873016357, 'train/info_loss': 0.24985633790493011, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011682645417749883, 'train/video_loss': 0.24973951280117035, 'train/total_loss': 0.7833397388458252} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.23217120170593264, 'train/info_loss': 0.21457386016845703, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011569010093808174, 'train/video_loss': 0.214458167552948, 'train/total_loss': 0.4466293752193451} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4244, 'grad_norm': 3.512707233428955, 'learning_rate': 6.179326231772123e-06}[Rank 2] Trainer log: {'loss': 0.4244, 'grad_norm': 3.512707233428955, 'learning_rate': 6.179326231772123e-06} - -[Rank 0] Trainer log: {'loss': 0.4244, 'grad_norm': 3.512707233428955, 'learning_rate': 6.179326231772123e-06}[Rank 3] Trainer log: {'loss': 0.4244, 'grad_norm': 3.512707233428955, 'learning_rate': 6.179326231772123e-06} - -{'loss': 0.4244, 'grad_norm': 3.512707233428955, 'learning_rate': 6.179326231772123e-06, 'epoch': 0.64} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1259252667427063, 'train/info_loss': 0.172590434551239, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010847264202311635, 'train/video_loss': 0.172481968998909, 'train/total_loss': 0.29840725660324097} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1186, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003857918549329043, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.2842443585395813, 'train/uncertainty_loss': 0.011857124418020249, 'train/video_loss': 0.29920753836631775, 'train/total_loss': 0.2992435097694397} -[Rank 1] Trainer log: {'loss': 0.3099, 'grad_norm': 2.8603577613830566, 'learning_rate': 6.169470159446147e-06}[Rank 3] Trainer log: {'loss': 0.3099, 'grad_norm': 2.8603577613830566, 'learning_rate': 6.169470159446147e-06}[Rank 0] Trainer log: {'loss': 0.3099, 'grad_norm': 2.8603577613830566, 'learning_rate': 6.169470159446147e-06} - -[Rank 2] Trainer log: {'loss': 0.3099, 'grad_norm': 2.8603577613830566, 'learning_rate': 6.169470159446147e-06} - -{'loss': 0.3099, 'grad_norm': 2.8603577613830566, 'learning_rate': 6.169470159446147e-06, 'epoch': 0.64} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1587, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1229, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003234912408515811, 'train/lm_loss': 3.635553002823144e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.2932474613189697, 'train/uncertainty_loss': 0.012294338643550874, 'train/video_loss': 0.3081494867801666, 'train/total_loss': 0.3081858456134796} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20550634860992434, 'train/info_loss': 0.12927304208278656, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010393380653113128, 'train/video_loss': 0.12916910648345947, 'train/total_loss': 0.33467546105384827} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3076, 'grad_norm': 6.84275484085083, 'learning_rate': 6.159618446131074e-06} -[Rank 0] Trainer log: {'loss': 0.3076, 'grad_norm': 6.84275484085083, 'learning_rate': 6.159618446131074e-06}[Rank 2] Trainer log: {'loss': 0.3076, 'grad_norm': 6.84275484085083, 'learning_rate': 6.159618446131074e-06} -[Rank 3] Trainer log: {'loss': 0.3076, 'grad_norm': 6.84275484085083, 'learning_rate': 6.159618446131074e-06} - -{'loss': 0.3076, 'grad_norm': 6.84275484085083, 'learning_rate': 6.159618446131074e-06, 'epoch': 0.64} -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.18680503368377688, 'train/info_loss': 0.1589684933423996, 'train/ref_loss': None, 'train/uncertainty_loss': -9.554707212373615e-05, 'train/video_loss': 0.15887294709682465, 'train/total_loss': 0.3456779718399048} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33805763721466064, 'train/info_loss': 0.1566331833600998, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011625355109572411, 'train/video_loss': 0.1565169245004654, 'train/total_loss': 0.49457454681396484} -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3778, 'grad_norm': 3.7687482833862305, 'learning_rate': 6.149771103037821e-06}[Rank 1] Trainer log: {'loss': 0.3778, 'grad_norm': 3.7687482833862305, 'learning_rate': 6.149771103037821e-06} - -[Rank 3] Trainer log: {'loss': 0.3778, 'grad_norm': 3.7687482833862305, 'learning_rate': 6.149771103037821e-06} -[Rank 0] Trainer log: {'loss': 0.3778, 'grad_norm': 3.7687482833862305, 'learning_rate': 6.149771103037821e-06} -{'loss': 0.3778, 'grad_norm': 3.7687482833862305, 'learning_rate': 6.149771103037821e-06, 'epoch': 0.64} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06682546734809876, 'train/info_loss': 0.30030423402786255, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010744037572294474, 'train/video_loss': 0.3001967966556549, 'train/total_loss': 0.3670222759246826} -tensor(0.0376, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031851020175963643, 'train/lm_loss': 3.659390495158732e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.17624610662460327, 'train/uncertainty_loss': -7.306275656446815e-05, 'train/video_loss': 0.1787402629852295, 'train/total_loss': 0.17877686023712158} -[Rank 0] Trainer log: {'loss': 0.3053, 'grad_norm': 9.282052993774414, 'learning_rate': 6.139928141372327e-06}[Rank 2] Trainer log: {'loss': 0.3053, 'grad_norm': 9.282052993774414, 'learning_rate': 6.139928141372327e-06}[Rank 1] Trainer log: {'loss': 0.3053, 'grad_norm': 9.282052993774414, 'learning_rate': 6.139928141372327e-06} - -[Rank 3] Trainer log: {'loss': 0.3053, 'grad_norm': 9.282052993774414, 'learning_rate': 6.139928141372327e-06} - -{'loss': 0.3053, 'grad_norm': 9.282052993774414, 'learning_rate': 6.139928141372327e-06, 'epoch': 0.64} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2295, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016633343184366823, 'train/lm_loss': 6.126419175416231e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.1816377490758896, 'train/uncertainty_loss': -7.052539149299264e-05, 'train/video_loss': 0.18292167782783508, 'train/total_loss': 0.182982936501503} -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.39434156417846683, 'train/info_loss': 0.2415270060300827, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013676300877705217, 'train/video_loss': 0.24139024317264557, 'train/total_loss': 0.6357318162918091} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.3714, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3989, 'grad_norm': 2.7048771381378174, 'learning_rate': 6.130089572335536e-06}[Rank 1] Trainer log: {'loss': 0.3989, 'grad_norm': 2.7048771381378174, 'learning_rate': 6.130089572335536e-06} -[Rank 3] Trainer log: {'loss': 0.3989, 'grad_norm': 2.7048771381378174, 'learning_rate': 6.130089572335536e-06} - -[Rank 2] Trainer log: {'loss': 0.3989, 'grad_norm': 2.7048771381378174, 'learning_rate': 6.130089572335536e-06} -{'loss': 0.3989, 'grad_norm': 2.7048771381378174, 'learning_rate': 6.130089572335536e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3091481685638428, 'train/info_loss': 0.2121775597333908, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011250425595790149, 'train/video_loss': 0.21206505596637726, 'train/total_loss': 0.5212132334709167} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0406, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021647259127348663, 'train/lm_loss': 2.453176712151617e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.2438611090183258, 'train/uncertainty_loss': 0.004057924449443817, 'train/video_loss': 0.2496679276227951, 'train/total_loss': 0.24969245493412018} -[Rank 3] Trainer log: {'loss': 0.4038, 'grad_norm': 6.3089213371276855, 'learning_rate': 6.120255407123408e-06}[Rank 0] Trainer log: {'loss': 0.4038, 'grad_norm': 6.3089213371276855, 'learning_rate': 6.120255407123408e-06}[Rank 1] Trainer log: {'loss': 0.4038, 'grad_norm': 6.3089213371276855, 'learning_rate': 6.120255407123408e-06} - -[Rank 2] Trainer log: {'loss': 0.4038, 'grad_norm': 6.3089213371276855, 'learning_rate': 6.120255407123408e-06} - -{'loss': 0.4038, 'grad_norm': 6.3089213371276855, 'learning_rate': 6.120255407123408e-06, 'epoch': 0.65} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15396040678024292, 'train/info_loss': 0.2187943309545517, 'train/ref_loss': None, 'train/uncertainty_loss': -8.150316425599159e-05, 'train/video_loss': 0.21871282160282135, 'train/total_loss': 0.3726732134819031} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07127287387847901, 'train/info_loss': 0.16547083854675293, 'train/ref_loss': None, 'train/uncertainty_loss': -9.010076755657793e-05, 'train/video_loss': 0.16538073122501373, 'train/total_loss': 0.23665359616279602} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1314, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2752, 'grad_norm': 4.30164098739624, 'learning_rate': 6.1104256569268775e-06}[Rank 1] Trainer log: {'loss': 0.2752, 'grad_norm': 4.30164098739624, 'learning_rate': 6.1104256569268775e-06}[Rank 0] Trainer log: {'loss': 0.2752, 'grad_norm': 4.30164098739624, 'learning_rate': 6.1104256569268775e-06} - - -[Rank 3] Trainer log: {'loss': 0.2752, 'grad_norm': 4.30164098739624, 'learning_rate': 6.1104256569268775e-06} -{'loss': 0.2752, 'grad_norm': 4.30164098739624, 'learning_rate': 6.1104256569268775e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.30509281158447266, 'train/info_loss': 0.1872698962688446, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010543474927544595, 'train/video_loss': 0.18716445565223694, 'train/total_loss': 0.4922572672367096} -tensor(0.2305, device='cuda:2', grad_fn=) tensor(0.2928, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.05395435094833374, 'train/info_loss': 0.19311457872390747, 'train/ref_loss': None, 'train/uncertainty_loss': -8.475512149743737e-05, 'train/video_loss': 0.19302982091903687, 'train/total_loss': 0.24698416888713837} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3594, 'grad_norm': 10.624917030334473, 'learning_rate': 6.100600332931867e-06} -[Rank 1] Trainer log: {'loss': 0.3594, 'grad_norm': 10.624917030334473, 'learning_rate': 6.100600332931867e-06} -[Rank 0] Trainer log: {'loss': 0.3594, 'grad_norm': 10.624917030334473, 'learning_rate': 6.100600332931867e-06} -[Rank 2] Trainer log: {'loss': 0.3594, 'grad_norm': 10.624917030334473, 'learning_rate': 6.100600332931867e-06} -{'loss': 0.3594, 'grad_norm': 10.624917030334473, 'learning_rate': 6.100600332931867e-06, 'epoch': 0.65} -tensor(0.5322, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0385, device='cuda:3', grad_fn=) tensor(0.0045, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019129341235384347, 'train/lm_loss': 2.493702922947705e-05, 'train/info_loss': 1.8536700736149214e-05, 'train/ref_loss': 0.20168760418891907, 'train/uncertainty_loss': 0.000446359533816576, 'train/video_loss': 0.20368285477161407, 'train/total_loss': 0.20370778441429138} -tensor(0.0471, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0560, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2980, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0136, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00040655173361301425, 'train/lm_loss': 4.1170651093125345e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.22776004672050476, 'train/uncertainty_loss': 0.0013647401705384255, 'train/video_loss': 0.23239818215370178, 'train/total_loss': 0.23243935406208038} -[Rank 3] Trainer log: {'loss': 0.309, 'grad_norm': 11.365703582763672, 'learning_rate': 6.090779446319251e-06} -[Rank 1] Trainer log: {'loss': 0.309, 'grad_norm': 11.365703582763672, 'learning_rate': 6.090779446319251e-06}[Rank 2] Trainer log: {'loss': 0.309, 'grad_norm': 11.365703582763672, 'learning_rate': 6.090779446319251e-06} - -[Rank 0] Trainer log: {'loss': 0.309, 'grad_norm': 11.365703582763672, 'learning_rate': 6.090779446319251e-06} -{'loss': 0.309, 'grad_norm': 11.365703582763672, 'learning_rate': 6.090779446319251e-06, 'epoch': 0.65} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001201898092404008, 'train/lm_loss': 3.6331691080704335e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.14254766702651978, 'train/uncertainty_loss': -6.973120616748929e-05, 'train/video_loss': 0.1434594690799713, 'train/total_loss': 0.14349579811096191} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12148813009262086, 'train/info_loss': 0.15951067209243774, 'train/ref_loss': None, 'train/uncertainty_loss': -9.37403179705143e-05, 'train/video_loss': 0.15941692888736725, 'train/total_loss': 0.2809050679206848} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3727, 'grad_norm': 3.8285598754882812, 'learning_rate': 6.080963008264862e-06}[Rank 1] Trainer log: {'loss': 0.3727, 'grad_norm': 3.8285598754882812, 'learning_rate': 6.080963008264862e-06}[Rank 2] Trainer log: {'loss': 0.3727, 'grad_norm': 3.8285598754882812, 'learning_rate': 6.080963008264862e-06} - - -[Rank 3] Trainer log: {'loss': 0.3727, 'grad_norm': 3.8285598754882812, 'learning_rate': 6.080963008264862e-06} -{'loss': 0.3727, 'grad_norm': 3.8285598754882812, 'learning_rate': 6.080963008264862e-06, 'epoch': 0.65} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.2109, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2038, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3402, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015569863608106972, 'train/lm_loss': 4.1671225335448984e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.41875872015953064, 'train/uncertainty_loss': 0.034024116396903996, 'train/video_loss': 0.45405006408691406, 'train/total_loss': 0.45409172773361206} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0737, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00012626335956156254, 'train/lm_loss': 3.1206556013785305e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.26868340373039246, 'train/uncertainty_loss': 0.007372397929430008, 'train/video_loss': 0.2770853638648987, 'train/total_loss': 0.2771165668964386} -[Rank 0] Trainer log: {'loss': 0.4181, 'grad_norm': 9.314566612243652, 'learning_rate': 6.071151029939474e-06}[Rank 2] Trainer log: {'loss': 0.4181, 'grad_norm': 9.314566612243652, 'learning_rate': 6.071151029939474e-06} -[Rank 1] Trainer log: {'loss': 0.4181, 'grad_norm': 9.314566612243652, 'learning_rate': 6.071151029939474e-06}[Rank 3] Trainer log: {'loss': 0.4181, 'grad_norm': 9.314566612243652, 'learning_rate': 6.071151029939474e-06} - - -{'loss': 0.4181, 'grad_norm': 9.314566612243652, 'learning_rate': 6.071151029939474e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28261466026306153, 'train/info_loss': 0.16566526889801025, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012202216312289238, 'train/video_loss': 0.16554324328899384, 'train/total_loss': 0.4481579065322876} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36076054573059085, 'train/info_loss': 0.16721072793006897, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010112309828400612, 'train/video_loss': 0.16710960865020752, 'train/total_loss': 0.5278701782226562} -tensor(0.0880, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3979, 'grad_norm': 5.674367904663086, 'learning_rate': 6.0613435225087695e-06} -[Rank 0] Trainer log: {'loss': 0.3979, 'grad_norm': 5.674367904663086, 'learning_rate': 6.0613435225087695e-06}[Rank 2] Trainer log: {'loss': 0.3979, 'grad_norm': 5.674367904663086, 'learning_rate': 6.0613435225087695e-06} -[Rank 3] Trainer log: {'loss': 0.3979, 'grad_norm': 5.674367904663086, 'learning_rate': 6.0613435225087695e-06} - -{'loss': 0.3979, 'grad_norm': 5.674367904663086, 'learning_rate': 6.0613435225087695e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3875551223754883, 'train/info_loss': 0.07445565611124039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011913861380890012, 'train/video_loss': 0.07433651387691498, 'train/total_loss': 0.46189165115356445} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24314424991607667, 'train/info_loss': 0.14914843440055847, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011830562725663186, 'train/video_loss': 0.14903013408184052, 'train/total_loss': 0.3921743929386139} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.0538, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3619, 'grad_norm': 4.594131946563721, 'learning_rate': 6.051540497133361e-06} -[Rank 2] Trainer log: {'loss': 0.3619, 'grad_norm': 4.594131946563721, 'learning_rate': 6.051540497133361e-06} -[Rank 3] Trainer log: {'loss': 0.3619, 'grad_norm': 4.594131946563721, 'learning_rate': 6.051540497133361e-06} -[Rank 0] Trainer log: {'loss': 0.3619, 'grad_norm': 4.594131946563721, 'learning_rate': 6.051540497133361e-06} -{'loss': 0.3619, 'grad_norm': 4.594131946563721, 'learning_rate': 6.051540497133361e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.33425045013427734, 'train/info_loss': 0.17607320845127106, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011425480479374528, 'train/video_loss': 0.17595894634723663, 'train/total_loss': 0.5102093815803528} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:1', grad_fn=) tensor(-0.0015, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3049135208129883, 'train/info_loss': 0.15897847712039948, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011565380264073611, 'train/video_loss': 0.15886282920837402, 'train/total_loss': 0.4637763500213623} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3896, 'grad_norm': 3.2512476444244385, 'learning_rate': 6.041741964968753e-06}[Rank 0] Trainer log: {'loss': 0.3896, 'grad_norm': 3.2512476444244385, 'learning_rate': 6.041741964968753e-06}[Rank 1] Trainer log: {'loss': 0.3896, 'grad_norm': 3.2512476444244385, 'learning_rate': 6.041741964968753e-06} - - -[Rank 2] Trainer log: {'loss': 0.3896, 'grad_norm': 3.2512476444244385, 'learning_rate': 6.041741964968753e-06} -{'loss': 0.3896, 'grad_norm': 3.2512476444244385, 'learning_rate': 6.041741964968753e-06, 'epoch': 0.65} -tensor(0.1013, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.1956, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015035064425319435, 'train/lm_loss': 4.6653073513880376e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.34346187114715576, 'train/uncertainty_loss': 0.01956199258565903, 'train/video_loss': 0.364248663187027, 'train/total_loss': 0.3642953038215637} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2224, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003819222562015057, 'train/lm_loss': 2.8012221446260813e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.3621264100074768, 'train/uncertainty_loss': 0.022242796421051026, 'train/video_loss': 0.3874433934688568, 'train/total_loss': 0.38747140765190125} -tensor(0.1826, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2662, 'grad_norm': 8.812832832336426, 'learning_rate': 6.031947937165335e-06}[Rank 1] Trainer log: {'loss': 0.2662, 'grad_norm': 8.812832832336426, 'learning_rate': 6.031947937165335e-06}[Rank 2] Trainer log: {'loss': 0.2662, 'grad_norm': 8.812832832336426, 'learning_rate': 6.031947937165335e-06} - -[Rank 3] Trainer log: {'loss': 0.2662, 'grad_norm': 8.812832832336426, 'learning_rate': 6.031947937165335e-06} - -{'loss': 0.2662, 'grad_norm': 8.812832832336426, 'learning_rate': 6.031947937165335e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00010197197552770377, 'train/lm_loss': 3.2279270817525686e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.22136563062667847, 'train/uncertainty_loss': -7.094729808159173e-05, 'train/video_loss': 0.22212786972522736, 'train/total_loss': 0.22216014564037323} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.2055, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017701269825920463, 'train/lm_loss': 3.649855498224497e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.35028719902038574, 'train/uncertainty_loss': 0.020550471544265748, 'train/video_loss': 0.37227287888526917, 'train/total_loss': 0.3723093867301941} -[Rank 0] Trainer log: {'loss': 0.3965, 'grad_norm': 5.5397868156433105, 'learning_rate': 6.022158424868374e-06}[Rank 3] Trainer log: {'loss': 0.3965, 'grad_norm': 5.5397868156433105, 'learning_rate': 6.022158424868374e-06} -[Rank 1] Trainer log: {'loss': 0.3965, 'grad_norm': 5.5397868156433105, 'learning_rate': 6.022158424868374e-06}[Rank 2] Trainer log: {'loss': 0.3965, 'grad_norm': 5.5397868156433105, 'learning_rate': 6.022158424868374e-06} - - -{'loss': 0.3965, 'grad_norm': 5.5397868156433105, 'learning_rate': 6.022158424868374e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31332595348358155, 'train/info_loss': 0.24958620965480804, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011581503786146642, 'train/video_loss': 0.24947039783000946, 'train/total_loss': 0.5627963542938232} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.9475, device='cuda:0', grad_fn=) tensor(0.1543, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:1', grad_fn=) - -{'train/tv_loss': 0.00028695354703813793, 'train/lm_loss': 0.00012074639089405538, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.7806696891784668, 'train/uncertainty_loss': 0.09474682211875916, 'train/video_loss': 0.877738893032074, 'train/total_loss': 0.8778596520423889} -[Rank 3] Trainer log: {'loss': 0.5574, 'grad_norm': 12.036458969116211, 'learning_rate': 6.012373439217997e-06} -[Rank 1] Trainer log: {'loss': 0.5574, 'grad_norm': 12.036458969116211, 'learning_rate': 6.012373439217997e-06} -[Rank 0] Trainer log: {'loss': 0.5574, 'grad_norm': 12.036458969116211, 'learning_rate': 6.012373439217997e-06}[Rank 2] Trainer log: {'loss': 0.5574, 'grad_norm': 12.036458969116211, 'learning_rate': 6.012373439217997e-06} - -{'loss': 0.5574, 'grad_norm': 12.036458969116211, 'learning_rate': 6.012373439217997e-06, 'epoch': 0.65} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016487744869664311, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.11467013508081436, 'train/uncertainty_loss': -6.890686345286668e-05, 'train/video_loss': 0.11593794822692871, 'train/total_loss': 0.11596272140741348} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.26505284309387206, 'train/info_loss': 0.134796142578125, 'train/ref_loss': None, 'train/uncertainty_loss': -9.834977099671961e-05, 'train/video_loss': 0.1346977949142456, 'train/total_loss': 0.39975064992904663} -tensor(0.6634, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.3826, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4388, 'grad_norm': 11.89124584197998, 'learning_rate': 6.002592991349183e-06} -[Rank 2] Trainer log: {'loss': 0.4388, 'grad_norm': 11.89124584197998, 'learning_rate': 6.002592991349183e-06} -[Rank 3] Trainer log: {'loss': 0.4388, 'grad_norm': 11.89124584197998, 'learning_rate': 6.002592991349183e-06} -[Rank 0] Trainer log: {'loss': 0.4388, 'grad_norm': 11.89124584197998, 'learning_rate': 6.002592991349183e-06} -{'loss': 0.4388, 'grad_norm': 11.89124584197998, 'learning_rate': 6.002592991349183e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023707689251750708, 'train/lm_loss': 3.6427041050046685e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.1368878036737442, 'train/uncertainty_loss': -6.98448158800602e-05, 'train/video_loss': 0.13873431086540222, 'train/total_loss': 0.13877074420452118} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1162, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002635343000292778, 'train/lm_loss': 2.784535172395408e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.29432040452957153, 'train/uncertainty_loss': 0.011622419208288194, 'train/video_loss': 0.30806881189346313, 'train/total_loss': 0.30809664726257324} -tensor(0.6200, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.4336, 'grad_norm': 6.902194023132324, 'learning_rate': 5.992817092391739e-06}[Rank 3] Trainer log: {'loss': 0.4336, 'grad_norm': 6.902194023132324, 'learning_rate': 5.992817092391739e-06} -[Rank 0] Trainer log: {'loss': 0.4336, 'grad_norm': 6.902194023132324, 'learning_rate': 5.992817092391739e-06} -[Rank 1] Trainer log: {'loss': 0.4336, 'grad_norm': 6.902194023132324, 'learning_rate': 5.992817092391739e-06} - -{'loss': 0.4336, 'grad_norm': 6.902194023132324, 'learning_rate': 5.992817092391739e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(0.0925, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018267695559188725, 'train/lm_loss': 4.1361345211043954e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.1456296741962433, 'train/uncertainty_loss': -7.200191612355411e-05, 'train/video_loss': 0.14704179763793945, 'train/total_loss': 0.14708316326141357} -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.12269148826599122, 'train/info_loss': 0.15473423898220062, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001012746011838317, 'train/video_loss': 0.15463297069072723, 'train/total_loss': 0.27732446789741516} -tensor(0.0930, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1259, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.297, 'grad_norm': 3.8812334537506104, 'learning_rate': 5.983045753470308e-06}[Rank 1] Trainer log: {'loss': 0.297, 'grad_norm': 3.8812334537506104, 'learning_rate': 5.983045753470308e-06} - -[Rank 0] Trainer log: {'loss': 0.297, 'grad_norm': 3.8812334537506104, 'learning_rate': 5.983045753470308e-06}[Rank 3] Trainer log: {'loss': 0.297, 'grad_norm': 3.8812334537506104, 'learning_rate': 5.983045753470308e-06} - -{'loss': 0.297, 'grad_norm': 3.8812334537506104, 'learning_rate': 5.983045753470308e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1739600419998169, 'train/info_loss': 0.08582603186368942, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012579829199239613, 'train/video_loss': 0.08570023626089096, 'train/total_loss': 0.2596602737903595} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.380991530418396, 'train/info_loss': 0.19478654861450195, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001318382564932108, 'train/video_loss': 0.1946547031402588, 'train/total_loss': 0.5756462812423706} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3832, 'grad_norm': 2.8839969635009766, 'learning_rate': 5.9732789857043294e-06}[Rank 0] Trainer log: {'loss': 0.3832, 'grad_norm': 2.8839969635009766, 'learning_rate': 5.9732789857043294e-06}[Rank 1] Trainer log: {'loss': 0.3832, 'grad_norm': 2.8839969635009766, 'learning_rate': 5.9732789857043294e-06} - - -[Rank 3] Trainer log: {'loss': 0.3832, 'grad_norm': 2.8839969635009766, 'learning_rate': 5.9732789857043294e-06} -{'loss': 0.3832, 'grad_norm': 2.8839969635009766, 'learning_rate': 5.9732789857043294e-06, 'epoch': 0.65} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1309, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017965297447517515, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.3007547855377197, 'train/uncertainty_loss': 0.013092812895774842, 'train/video_loss': 0.31530365347862244, 'train/total_loss': 0.3153357207775116} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019407494692131878, 'train/lm_loss': 3.23031097650528e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.19931256771087646, 'train/uncertainty_loss': -6.924906629137696e-05, 'train/video_loss': 0.200815349817276, 'train/total_loss': 0.20084765553474426} -tensor(0.1337, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.2006, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2811, 'grad_norm': 14.691533088684082, 'learning_rate': 5.963516800208056e-06}[Rank 2] Trainer log: {'loss': 0.2811, 'grad_norm': 14.691533088684082, 'learning_rate': 5.963516800208056e-06} - -[Rank 0] Trainer log: {'loss': 0.2811, 'grad_norm': 14.691533088684082, 'learning_rate': 5.963516800208056e-06}[Rank 3] Trainer log: {'loss': 0.2811, 'grad_norm': 14.691533088684082, 'learning_rate': 5.963516800208056e-06} - -{'loss': 0.2811, 'grad_norm': 14.691533088684082, 'learning_rate': 5.963516800208056e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0302, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025557156186550856, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.1393357366323471, 'train/uncertainty_loss': -6.781655829399825e-05, 'train/video_loss': 0.1413296014070511, 'train/total_loss': 0.14135436713695526} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.09096441268920899, 'train/info_loss': 0.19193445146083832, 'train/ref_loss': None, 'train/uncertainty_loss': -9.373943321406842e-05, 'train/video_loss': 0.19184070825576782, 'train/total_loss': 0.28280511498451233} -[Rank 1] Trainer log: {'loss': 0.3363, 'grad_norm': 5.027533054351807, 'learning_rate': 5.9537592080905105e-06}[Rank 3] Trainer log: {'loss': 0.3363, 'grad_norm': 5.027533054351807, 'learning_rate': 5.9537592080905105e-06} - -[Rank 0] Trainer log: {'loss': 0.3363, 'grad_norm': 5.027533054351807, 'learning_rate': 5.9537592080905105e-06}[Rank 2] Trainer log: {'loss': 0.3363, 'grad_norm': 5.027533054351807, 'learning_rate': 5.9537592080905105e-06} - -{'loss': 0.3363, 'grad_norm': 5.027533054351807, 'learning_rate': 5.9537592080905105e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017119521507993343, 'train/lm_loss': 4.705829196609557e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.11932078003883362, 'train/uncertainty_loss': -6.919216248206795e-05, 'train/video_loss': 0.12064313888549805, 'train/total_loss': 0.12069019675254822} -tensor(0.3195, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2552236557006836, 'train/info_loss': 0.23188261687755585, 'train/ref_loss': None, 'train/uncertainty_loss': -8.718921453692019e-05, 'train/video_loss': 0.23179543018341064, 'train/total_loss': 0.4870190918445587} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3428, 'grad_norm': 6.271315574645996, 'learning_rate': 5.944006220455502e-06}[Rank 2] Trainer log: {'loss': 0.3428, 'grad_norm': 6.271315574645996, 'learning_rate': 5.944006220455502e-06} -[Rank 3] Trainer log: {'loss': 0.3428, 'grad_norm': 6.271315574645996, 'learning_rate': 5.944006220455502e-06} - -[Rank 1] Trainer log: {'loss': 0.3428, 'grad_norm': 6.271315574645996, 'learning_rate': 5.944006220455502e-06} -{'loss': 0.3428, 'grad_norm': 6.271315574645996, 'learning_rate': 5.944006220455502e-06, 'epoch': 0.65} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31928031444549565, 'train/info_loss': 0.25193366408348083, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010914799058809878, 'train/video_loss': 0.2518245279788971, 'train/total_loss': 0.571104884147644} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(0.1001, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017066225409507753, 'train/lm_loss': 2.4841673439368606e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.2801535129547119, 'train/uncertainty_loss': 0.010007649660110474, 'train/video_loss': 0.2915453016757965, 'train/total_loss': 0.29157015681266785} -[Rank 0] Trainer log: {'loss': 0.389, 'grad_norm': 2.9969305992126465, 'learning_rate': 5.934257848401594e-06}[Rank 2] Trainer log: {'loss': 0.389, 'grad_norm': 2.9969305992126465, 'learning_rate': 5.934257848401594e-06} - -[Rank 3] Trainer log: {'loss': 0.389, 'grad_norm': 2.9969305992126465, 'learning_rate': 5.934257848401594e-06} -[Rank 1] Trainer log: {'loss': 0.389, 'grad_norm': 2.9969305992126465, 'learning_rate': 5.934257848401594e-06} -{'loss': 0.389, 'grad_norm': 2.9969305992126465, 'learning_rate': 5.934257848401594e-06, 'epoch': 0.65} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10027623176574707, 'train/info_loss': 0.19487985968589783, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012369591277092696, 'train/video_loss': 0.1947561651468277, 'train/total_loss': 0.2950323820114136} -tensor(0.0308, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.04951271414756775, 'train/info_loss': 0.10716497153043747, 'train/ref_loss': None, 'train/uncertainty_loss': -9.935139678418636e-05, 'train/video_loss': 0.10706561803817749, 'train/total_loss': 0.15657833218574524} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(0.4356, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3021, 'grad_norm': 3.6791577339172363, 'learning_rate': 5.924514103022095e-06}[Rank 3] Trainer log: {'loss': 0.3021, 'grad_norm': 3.6791577339172363, 'learning_rate': 5.924514103022095e-06} -[Rank 0] Trainer log: {'loss': 0.3021, 'grad_norm': 3.6791577339172363, 'learning_rate': 5.924514103022095e-06} - -[Rank 2] Trainer log: {'loss': 0.3021, 'grad_norm': 3.6791577339172363, 'learning_rate': 5.924514103022095e-06} -{'loss': 0.3021, 'grad_norm': 3.6791577339172363, 'learning_rate': 5.924514103022095e-06, 'epoch': 0.65} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.2726, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021985361818224194, 'train/lm_loss': 2.4698639754205944e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.13007214665412903, 'train/uncertainty_loss': -7.104239775799215e-05, 'train/video_loss': 0.13177995383739471, 'train/total_loss': 0.13180464506149292} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2455, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.28400440216064454, 'train/info_loss': 0.17893031239509583, 'train/ref_loss': None, 'train/uncertainty_loss': -8.189266081899406e-05, 'train/video_loss': 0.17884841561317444, 'train/total_loss': 0.4628528356552124} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.298, 'grad_norm': 8.45612621307373, 'learning_rate': 5.914774995405053e-06}[Rank 0] Trainer log: {'loss': 0.298, 'grad_norm': 8.45612621307373, 'learning_rate': 5.914774995405053e-06} -[Rank 3] Trainer log: {'loss': 0.298, 'grad_norm': 8.45612621307373, 'learning_rate': 5.914774995405053e-06} - -[Rank 2] Trainer log: {'loss': 0.298, 'grad_norm': 8.45612621307373, 'learning_rate': 5.914774995405053e-06}{'loss': 0.298, 'grad_norm': 8.45612621307373, 'learning_rate': 5.914774995405053e-06, 'epoch': 0.65} - -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015258146449923516, 'train/lm_loss': 2.5103901862166822e-05, 'train/info_loss': 1.8536700736149214e-05, 'train/ref_loss': 0.013917881064116955, 'train/uncertainty_loss': -6.555556901730598e-05, 'train/video_loss': 0.015091514214873314, 'train/total_loss': 0.015116618014872074} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.2425, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0421, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018067506607621908, 'train/lm_loss': 3.204089007340372e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.220712810754776, 'train/uncertainty_loss': 0.0042125403881073, 'train/video_loss': 0.2263917326927185, 'train/total_loss': 0.22642377018928528} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2486, 'grad_norm': 2.432915449142456, 'learning_rate': 5.905040536633237e-06}[Rank 2] Trainer log: {'loss': 0.2486, 'grad_norm': 2.432915449142456, 'learning_rate': 5.905040536633237e-06}[Rank 0] Trainer log: {'loss': 0.2486, 'grad_norm': 2.432915449142456, 'learning_rate': 5.905040536633237e-06} - -[Rank 3] Trainer log: {'loss': 0.2486, 'grad_norm': 2.432915449142456, 'learning_rate': 5.905040536633237e-06} - -{'loss': 0.2486, 'grad_norm': 2.432915449142456, 'learning_rate': 5.905040536633237e-06, 'epoch': 0.65} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1121, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002059600315988064, 'train/lm_loss': 7.689904887229205e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.2878027558326721, 'train/uncertainty_loss': 0.011212489753961564, 'train/video_loss': 0.3006860017776489, 'train/total_loss': 0.3007628917694092} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.041647848486900334, 'train/info_loss': 0.09906051307916641, 'train/ref_loss': None, 'train/uncertainty_loss': -9.864772437140346e-05, 'train/video_loss': 0.09896186739206314, 'train/total_loss': 0.1406097114086151} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2949, 'grad_norm': 6.660579204559326, 'learning_rate': 5.895310737784126e-06}[Rank 2] Trainer log: {'loss': 0.2949, 'grad_norm': 6.660579204559326, 'learning_rate': 5.895310737784126e-06}[Rank 0] Trainer log: {'loss': 0.2949, 'grad_norm': 6.660579204559326, 'learning_rate': 5.895310737784126e-06} - - -[Rank 3] Trainer log: {'loss': 0.2949, 'grad_norm': 6.660579204559326, 'learning_rate': 5.895310737784126e-06} -{'loss': 0.2949, 'grad_norm': 6.660579204559326, 'learning_rate': 5.895310737784126e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.263090991973877, 'train/info_loss': 0.1805194616317749, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012855682289227843, 'train/video_loss': 0.18039090931415558, 'train/total_loss': 0.4434819221496582} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1300, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(0.0082, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021481013391166925, 'train/lm_loss': 2.5008546072058382e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.17966030538082123, 'train/uncertainty_loss': 0.0008229100145399571, 'train/video_loss': 0.18221993744373322, 'train/total_loss': 0.1822449415922165} -tensor(0.0788, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3461, 'grad_norm': 4.936594009399414, 'learning_rate': 5.885585609929891e-06}[Rank 0] Trainer log: {'loss': 0.3461, 'grad_norm': 4.936594009399414, 'learning_rate': 5.885585609929891e-06} -[Rank 3] Trainer log: {'loss': 0.3461, 'grad_norm': 4.936594009399414, 'learning_rate': 5.885585609929891e-06} -[Rank 2] Trainer log: {'loss': 0.3461, 'grad_norm': 4.936594009399414, 'learning_rate': 5.885585609929891e-06} - -{'loss': 0.3461, 'grad_norm': 4.936594009399414, 'learning_rate': 5.885585609929891e-06, 'epoch': 0.65} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1712, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0587, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.125469172000885, 'train/info_loss': 0.24883492290973663, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010197189403697849, 'train/video_loss': 0.24873295426368713, 'train/total_loss': 0.3742021322250366} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.34588074684143066, 'train/info_loss': 0.16652926802635193, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011811172589659691, 'train/video_loss': 0.1664111614227295, 'train/total_loss': 0.5122919082641602} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.1216, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.363, 'grad_norm': 3.61773943901062, 'learning_rate': 5.875865164137396e-06}[Rank 1] Trainer log: {'loss': 0.363, 'grad_norm': 3.61773943901062, 'learning_rate': 5.875865164137396e-06} -[Rank 3] Trainer log: {'loss': 0.363, 'grad_norm': 3.61773943901062, 'learning_rate': 5.875865164137396e-06} -[Rank 2] Trainer log: {'loss': 0.363, 'grad_norm': 3.61773943901062, 'learning_rate': 5.875865164137396e-06} - -{'loss': 0.363, 'grad_norm': 3.61773943901062, 'learning_rate': 5.875865164137396e-06, 'epoch': 0.65} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.3075, device='cuda:1', grad_fn=) tensor(-0.0006, device='cuda:1', grad_fn=) -tensor(0.2513, device='cuda:3', grad_fn=) tensor(0.1616, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00029094910714775327, 'train/lm_loss': 5.358931375667453e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.31780415773391724, 'train/uncertainty_loss': 0.016159597039222717, 'train/video_loss': 0.3363151252269745, 'train/total_loss': 0.33636870980262756} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32293994426727296, 'train/info_loss': 0.12928418815135956, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011226574424654246, 'train/video_loss': 0.1291719228029251, 'train/total_loss': 0.4521118998527527} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.2072, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}[Rank 3] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06}[Rank 1] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06} - - -[Rank 2] Trainer log: {'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06} -{'loss': 0.4444, 'grad_norm': 12.160444259643555, 'learning_rate': 5.866149411468177e-06, 'epoch': 0.65} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020880883093923332, 'train/lm_loss': 2.4817834491841496e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.18321365118026733, 'train/uncertainty_loss': -6.815286469645798e-05, 'train/video_loss': 0.1848347932100296, 'train/total_loss': 0.18485960364341736} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2403105974197388, 'train/info_loss': 0.2913327217102051, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010404838249087334, 'train/video_loss': 0.29122868180274963, 'train/total_loss': 0.531539261341095} -tensor(0.0252, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0596, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06}[Rank 1] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06} - -[Rank 0] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06} -[Rank 2] Trainer log: {'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06} -{'loss': 0.3638, 'grad_norm': 1.8527789115905762, 'learning_rate': 5.8564383629784175e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.38158850669860844, 'train/info_loss': 0.06055440008640289, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001311479019932449, 'train/video_loss': 0.06042325124144554, 'train/total_loss': 0.4420117735862732} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2511230230331421, 'train/info_loss': 0.2582288980484009, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012226408580318093, 'train/video_loss': 0.25810661911964417, 'train/total_loss': 0.5092296600341797} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06}[Rank 0] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06} -[Rank 1] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06} - -[Rank 2] Trainer log: {'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06} -{'loss': 0.3692, 'grad_norm': 2.5602004528045654, 'learning_rate': 5.846732029718962e-06, 'epoch': 0.65} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3681243896484375, 'train/info_loss': 0.07267985492944717, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012856742832809687, 'train/video_loss': 0.07255128771066666, 'train/total_loss': 0.44067567586898804} -tensor(0.2176, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0752, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29500226974487304, 'train/info_loss': 0.15204651653766632, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010267778998240829, 'train/video_loss': 0.15194383263587952, 'train/total_loss': 0.4469461143016815} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}[Rank 2] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06}[Rank 1] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06} - - -[Rank 3] Trainer log: {'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06} -{'loss': 0.3501, 'grad_norm': 5.391907691955566, 'learning_rate': 5.837030422735281e-06, 'epoch': 0.65} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.27378618717193604, 'train/info_loss': 0.18530026078224182, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001140484819188714, 'train/video_loss': 0.1851862072944641, 'train/total_loss': 0.45897239446640015} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2328, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1607, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00021110486704856157, 'train/lm_loss': 0.00011288317618891598, 'train/info_loss': 2.6761768822325394e-05, 'train/ref_loss': 0.3208291828632355, 'train/uncertainty_loss': 0.016073283553123475, 'train/video_loss': 0.33861806988716125, 'train/total_loss': 0.33873096108436584} -tensor(0.0314, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06}[Rank 1] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06} - -[Rank 2] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06} -[Rank 3] Trainer log: {'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06} -{'loss': 0.3379, 'grad_norm': 7.8567070960998535, 'learning_rate': 5.827333553067473e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.0031, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00014839788200333716, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.21624045073986053, 'train/uncertainty_loss': 0.0003052026499062777, 'train/video_loss': 0.21775078773498535, 'train/total_loss': 0.21777255833148956} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0286, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00024025768507272006, 'train/lm_loss': 3.1874023261480036e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.1827160120010376, 'train/uncertainty_loss': -6.815321394242347e-05, 'train/video_loss': 0.18458963930606842, 'train/total_loss': 0.18462151288986206} -[Rank 1] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06} -[Rank 3] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06} -[Rank 0] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06}[Rank 2] Trainer log: {'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06} - -{'loss': 0.2946, 'grad_norm': 7.998960971832275, 'learning_rate': 5.817641431750234e-06, 'epoch': 0.66} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0528, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00027445454616099597, 'train/lm_loss': 2.1790270693600178e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.250593900680542, 'train/uncertainty_loss': 0.0052795026451349265, 'train/video_loss': 0.2580859065055847, 'train/total_loss': 0.2581076920032501} -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21145300865173342, 'train/info_loss': 0.3242379426956177, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012023800518363715, 'train/video_loss': 0.3241176903247833, 'train/total_loss': 0.5355706810951233} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0132, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06}[Rank 1] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06} -[Rank 3] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06} - -[Rank 0] Trainer log: {'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06} -{'loss': 0.3252, 'grad_norm': 10.534229278564453, 'learning_rate': 5.807954069812862e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0294, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00017149079358205201, 'train/lm_loss': 4.7129800077527764e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.2356904149055481, 'train/uncertainty_loss': 0.0029447738081216815, 'train/video_loss': 0.2400316596031189, 'train/total_loss': 0.24007879197597504} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10311661958694458, 'train/info_loss': 0.17362730205059052, 'train/ref_loss': None, 'train/uncertainty_loss': -9.315699571743608e-05, 'train/video_loss': 0.17353413999080658, 'train/total_loss': 0.2766507565975189} -tensor(0.2431, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}[Rank 1] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06}[Rank 3] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06} - -[Rank 2] Trainer log: {'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06} - -{'loss': 0.3809, 'grad_norm': 4.766957759857178, 'learning_rate': 5.798271478279253e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.1595, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022061308845877647, 'train/lm_loss': 3.185018722433597e-05, 'train/info_loss': 1.9430735846981406e-05, 'train/ref_loss': 0.18868407607078552, 'train/uncertainty_loss': -6.92706322297454e-05, 'train/video_loss': 0.1903991401195526, 'train/total_loss': 0.19043098390102386} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.29790046215057375, 'train/info_loss': 0.1837206333875656, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011250831885263325, 'train/video_loss': 0.18360812962055206, 'train/total_loss': 0.48150861263275146} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06}[Rank 0] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06} -[Rank 3] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06} - -[Rank 2] Trainer log: {'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06} -{'loss': 0.3361, 'grad_norm': 4.297579288482666, 'learning_rate': 5.788593668167854e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.1702, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018524311017245056, 'train/lm_loss': 3.623634111136198e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.32768750190734863, 'train/uncertainty_loss': 0.017021270096302034, 'train/video_loss': 0.346211701631546, 'train/total_loss': 0.34624794125556946} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.1074, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.0083, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00019137355266138912, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.22056730091571808, 'train/uncertainty_loss': 0.0008338701911270619, 'train/video_loss': 0.22295451164245605, 'train/total_loss': 0.222990483045578} -[Rank 0] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}[Rank 2] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06}[Rank 3] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06} - - -{'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06, 'epoch': 0.66}[Rank 1] Trainer log: {'loss': 0.3789, 'grad_norm': 8.171895980834961, 'learning_rate': 5.7789206504916815e-06} - -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026274872943758963, 'train/lm_loss': 3.1874023261480036e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.1149064302444458, 'train/uncertainty_loss': -7.323419558815659e-05, 'train/video_loss': 0.11695586889982224, 'train/total_loss': 0.11698774248361588} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.0592, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016578567447140814, 'train/lm_loss': 6.0000957455486065e-05, 'train/info_loss': 2.574854443082586e-05, 'train/ref_loss': 0.25501570105552673, 'train/uncertainty_loss': 0.0059223812073469165, 'train/video_loss': 0.2622901201248169, 'train/total_loss': 0.2623501121997833} -[Rank 3] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06} -[Rank 1] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06}[Rank 0] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06} - -[Rank 2] Trainer log: {'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06} -{'loss': 0.3454, 'grad_norm': 3.6991794109344482, 'learning_rate': 5.769252436258295e-06, 'epoch': 0.66} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003217862220481038, 'train/lm_loss': 4.653389332816005e-05, 'train/info_loss': 2.1338008082238957e-05, 'train/ref_loss': 0.17965355515480042, 'train/uncertainty_loss': -7.330098887905479e-05, 'train/video_loss': 0.1821758896112442, 'train/total_loss': 0.1822224259376526} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.21704933643341065, 'train/info_loss': 0.08072395622730255, 'train/ref_loss': None, 'train/uncertainty_loss': -9.479672298766673e-05, 'train/video_loss': 0.08062916249036789, 'train/total_loss': 0.29767850041389465} -[Rank 1] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}[Rank 0] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06}[Rank 3] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06} - - -[Rank 2] Trainer log: {'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06} -{'loss': 0.3814, 'grad_norm': 3.887155294418335, 'learning_rate': 5.759589036469793e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15884593725204468, 'train/info_loss': 0.09679526090621948, 'train/ref_loss': None, 'train/uncertainty_loss': -9.324780548922718e-05, 'train/video_loss': 0.09670200943946838, 'train/total_loss': 0.2555479407310486} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13753668069839478, 'train/info_loss': 0.23872123658657074, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031742780469358, 'train/video_loss': 0.23861806094646454, 'train/total_loss': 0.37615475058555603} -tensor(0.1312, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}[Rank 3] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06}[Rank 2] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06} - -[Rank 1] Trainer log: {'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06} - -{'loss': 0.3451, 'grad_norm': 9.478434562683105, 'learning_rate': 5.749930462122784e-06, 'epoch': 0.66} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3149502038955689, 'train/info_loss': 0.1556193083524704, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012698310893028976, 'train/video_loss': 0.15549232065677643, 'train/total_loss': 0.470442533493042} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.0765468716621399, 'train/info_loss': 0.2542925477027893, 'train/ref_loss': None, 'train/uncertainty_loss': -8.723060018382967e-05, 'train/video_loss': 0.2542053163051605, 'train/total_loss': 0.3307521939277649} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0120, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06}[Rank 1] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06} -[Rank 2] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06} - -[Rank 3] Trainer log: {'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06} -{'loss': 0.2922, 'grad_norm': 1.9175676107406616, 'learning_rate': 5.740276724208397e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(0.1122, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1959, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016258273972198368, 'train/lm_loss': 6.1025843024253845e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.16883832216262817, 'train/uncertainty_loss': -7.159045781008899e-05, 'train/video_loss': 0.17009153962135315, 'train/total_loss': 0.17015255987644196} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(0.3660, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002153045265004039, 'train/lm_loss': 2.496086817700416e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.4590832591056824, 'train/uncertainty_loss': 0.03660423159599304, 'train/video_loss': 0.49742814898490906, 'train/total_loss': 0.49745312333106995} -tensor(0.2555, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06}[Rank 2] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06} -[Rank 1] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06} - -[Rank 0] Trainer log: {'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06} -{'loss': 0.3685, 'grad_norm': 5.5717644691467285, 'learning_rate': 5.7306278337122525e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(1.1109, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002126151230186224, 'train/lm_loss': 2.4722478701733053e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.9555550813674927, 'train/uncertainty_loss': 0.11109327077865601, 'train/video_loss': 1.0683666467666626, 'train/total_loss': 1.0683913230895996} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0930, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00013671087799593806, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2791999578475952, 'train/uncertainty_loss': 0.009300475567579269, 'train/video_loss': 0.2896132469177246, 'train/total_loss': 0.2896498143672943} -[Rank 1] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06}[Rank 0] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06} -[Rank 3] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06} - -[Rank 2] Trainer log: {'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06} -{'loss': 0.4932, 'grad_norm': 7.025022029876709, 'learning_rate': 5.720983801614455e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2787301301956177, 'train/info_loss': 0.2266109138727188, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011889106826856732, 'train/video_loss': 0.2264920175075531, 'train/total_loss': 0.5052221417427063} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.13043994903564454, 'train/info_loss': 0.17577716708183289, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001127044321037829, 'train/video_loss': 0.17566446959972382, 'train/total_loss': 0.3061044216156006} -[Rank 1] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}[Rank 3] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06}[Rank 0] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06} - - -[Rank 2] Trainer log: {'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06} -{'loss': 0.3514, 'grad_norm': 5.906411170959473, 'learning_rate': 5.7113446388895855e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023483284749090672, 'train/lm_loss': 3.645087999757379e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.06819528341293335, 'train/uncertainty_loss': -6.94944174028933e-05, 'train/video_loss': 0.07002680748701096, 'train/total_loss': 0.07006325572729111} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.5629, device='cuda:0', grad_fn=) tensor(-0.0008, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000887374859303236, 'train/lm_loss': 4.150436725467444e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.5856791138648987, 'train/uncertainty_loss': 0.05629110932350159, 'train/video_loss': 0.649091899394989, 'train/total_loss': 0.6491333842277527} -[Rank 1] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06} -[Rank 3] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06} -[Rank 0] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06}[Rank 2] Trainer log: {'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06} - -{'loss': 0.4015, 'grad_norm': 11.06789493560791, 'learning_rate': 5.701710356506665e-06, 'epoch': 0.66} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0157, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00012852794025093318, 'train/lm_loss': 5.418520304374397e-05, 'train/info_loss': 2.3424076061928645e-05, 'train/ref_loss': 0.22761771082878113, 'train/uncertainty_loss': 0.001567848213016987, 'train/video_loss': 0.23023721575737, 'train/total_loss': 0.23029139637947083} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0001663352712057531, 'train/lm_loss': 2.46032839640975e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.1792670488357544, 'train/uncertainty_loss': -7.328792125917972e-05, 'train/video_loss': 0.18054156005382538, 'train/total_loss': 0.18056616187095642} -[Rank 1] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}[Rank 3] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06}[Rank 0] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06} - -[Rank 2] Trainer log: {'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06} - -{'loss': 0.2317, 'grad_norm': 2.1956772804260254, 'learning_rate': 5.6920809654291945e-06, 'epoch': 0.66} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016776639968156816, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2100030779838562, 'train/uncertainty_loss': -7.060928619466722e-05, 'train/video_loss': 0.21129527688026428, 'train/total_loss': 0.21132317185401917} -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00012384995352476835, 'train/lm_loss': 4.760652373079211e-05, 'train/info_loss': 2.3781687559676357e-05, 'train/ref_loss': 0.09827212989330292, 'train/uncertainty_loss': -7.054724264889956e-05, 'train/video_loss': 0.09921616315841675, 'train/total_loss': 0.09926377236843109} -tensor(0.1509, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}[Rank 3] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06}[Rank 0] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06} - -[Rank 1] Trainer log: {'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06} - -{'loss': 0.2525, 'grad_norm': 3.496741771697998, 'learning_rate': 5.6824564766150724e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20042383670806885, 'train/info_loss': 0.1449424922466278, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011801900109276175, 'train/video_loss': 0.14482447504997253, 'train/total_loss': 0.3452483117580414} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.45617589950561527, 'train/info_loss': 0.24490997195243835, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012810240732505918, 'train/video_loss': 0.24478186666965485, 'train/total_loss': 0.7009577751159668} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}[Rank 1] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06}[Rank 0] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06} - -[Rank 3] Trainer log: {'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06} - -{'loss': 0.3158, 'grad_norm': 4.859067440032959, 'learning_rate': 5.6728369010166426e-06, 'epoch': 0.66} -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.11436709165573121, 'train/info_loss': 0.26202645897865295, 'train/ref_loss': None, 'train/uncertainty_loss': -9.564846986904741e-05, 'train/video_loss': 0.26193082332611084, 'train/total_loss': 0.3762979209423065} -tensor(-0.0015, device='cuda:2', grad_fn=) tensor(-0.0015, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15787471532821656, 'train/info_loss': 0.17485037446022034, 'train/ref_loss': None, 'train/uncertainty_loss': -9.450375218875707e-05, 'train/video_loss': 0.17475587129592896, 'train/total_loss': 0.33263057470321655} -[Rank 1] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}[Rank 3] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06}[Rank 0] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06} - -[Rank 2] Trainer log: {'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06} - -{'loss': 0.4708, 'grad_norm': 2.322233200073242, 'learning_rate': 5.663222249580649e-06, 'epoch': 0.66} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2767, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0772, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00016861287876963617, 'train/lm_loss': 4.701061698142439e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.2696513533592224, 'train/uncertainty_loss': 0.007722488045692444, 'train/video_loss': 0.2787451148033142, 'train/total_loss': 0.2787921130657196} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.20862238407135011, 'train/info_loss': 0.1744736284017563, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001176536548882723, 'train/video_loss': 0.17435596883296967, 'train/total_loss': 0.3829783499240875} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.2500, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06}[Rank 1] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06} -[Rank 0] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06} - -[Rank 2] Trainer log: {'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06} -{'loss': 0.3702, 'grad_norm': 10.16737174987793, 'learning_rate': 5.653612533248233e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0352, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026960894465446474, 'train/lm_loss': 2.4698639754205944e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.24252426624298096, 'train/uncertainty_loss': 0.0035187847912311557, 'train/video_loss': 0.2482176274061203, 'train/total_loss': 0.2482423186302185} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018143276683986188, 'train/lm_loss': 3.5974127240478995e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.08057326078414917, 'train/uncertainty_loss': -7.370269158855081e-05, 'train/video_loss': 0.08197075128555298, 'train/total_loss': 0.08200672268867493} -tensor(0.3041, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}[Rank 1] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06} - -[Rank 3] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06}[Rank 0] Trainer log: {'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06} - -{'loss': 0.3183, 'grad_norm': 8.234452247619629, 'learning_rate': 5.644007762954926e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06672886610031128, 'train/info_loss': 0.18479126691818237, 'train/ref_loss': None, 'train/uncertainty_loss': -9.260554797947408e-05, 'train/video_loss': 0.1846986562013626, 'train/total_loss': 0.2514275312423706} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00025838704314082864, 'train/lm_loss': 2.7893029619008305e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.09182281792163849, 'train/uncertainty_loss': -6.915096309967339e-05, 'train/video_loss': 0.09383846819400787, 'train/total_loss': 0.09386636316776276} -[Rank 1] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}[Rank 0] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06}[Rank 3] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06} - -[Rank 2] Trainer log: {'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06} - -{'loss': 0.2844, 'grad_norm': 3.304973602294922, 'learning_rate': 5.634407949630617e-06, 'epoch': 0.66} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(1.0766, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.7360, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00015928944339975716, 'train/lm_loss': 4.684376472141594e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.7353293299674988, 'train/uncertainty_loss': 0.07359982132911683, 'train/video_loss': 0.8102254271507263, 'train/total_loss': 0.8102722764015198} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2558208465576172, 'train/info_loss': 0.18679678440093994, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010758317075669766, 'train/video_loss': 0.18668919801712036, 'train/total_loss': 0.44251003861427307} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06} -[Rank 2] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06} -[Rank 0] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06}[Rank 1] Trainer log: {'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06} - -{'loss': 0.4651, 'grad_norm': 17.812118530273438, 'learning_rate': 5.624813104199567e-06, 'epoch': 0.66} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.4664, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.36234724521636963, 'train/info_loss': 0.14337365329265594, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011461186222732067, 'train/video_loss': 0.14325904846191406, 'train/total_loss': 0.5056062936782837} -tensor(0.0681, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 0] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 2] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06} - - -[Rank 1] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06} -{'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06900591254234315, 'train/info_loss': 0.15682968497276306, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010683162836357952, 'train/video_loss': 0.15672285854816437, 'train/total_loss': 0.22572878003120422} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.40285620689392093, 'train/info_loss': 0.18601451814174652, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011755496961995959, 'train/video_loss': 0.18589696288108826, 'train/total_loss': 0.5887531638145447} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0370, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06} -[Rank 0] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}[Rank 3] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06} - -[Rank 2] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06} -{'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06, 'epoch': 0.66} -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.2234, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00023927739821374419, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.3630368113517761, 'train/uncertainty_loss': 0.022341114282608033, 'train/video_loss': 0.3873103857040405, 'train/total_loss': 0.3873384892940521} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:1', grad_fn=) tensor(-0.0013, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.14715486764907837, 'train/info_loss': 0.13828474283218384, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031541614793241, 'train/video_loss': 0.13818158209323883, 'train/total_loss': 0.285336434841156} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}[Rank 1] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06} -[Rank 2] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06} - -[Rank 3] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06} -{'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06, 'epoch': 0.66} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4195257663726807, 'train/info_loss': 0.10877030342817307, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011566100874915719, 'train/video_loss': 0.10865464061498642, 'train/total_loss': 0.5281804203987122} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1955, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0016, device='cuda:0', grad_fn=) tensor(-0.0016, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 2.8298282995820048e-05, 'train/info_loss': 2.342404332011938e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015964442864060404, 'train/video_loss': -0.00013622039114125073, 'train/total_loss': -0.00010792211105581373} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.0522, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06} -[Rank 0] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}[Rank 1] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06} - -[Rank 2] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06} -{'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06, 'epoch': 0.66} -tensor(-0.0016, device='cuda:3', grad_fn=) tensor(-0.0016, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00031795650720596315, 'train/lm_loss': 7.840050384402276e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.1724846065044403, 'train/uncertainty_loss': -6.841651047579944e-05, 'train/video_loss': 0.17498399317264557, 'train/total_loss': 0.1750623881816864} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19196442365646363, 'train/info_loss': 0.2579406201839447, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011937021045014263, 'train/video_loss': 0.257821261882782, 'train/total_loss': 0.4497857093811035} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06} -[Rank 3] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06} -[Rank 0] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}[Rank 1] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06} - -{'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06, 'epoch': 0.66} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3795618772506714, 'train/info_loss': 0.29438889026641846, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011655737180262805, 'train/video_loss': 0.2942723333835602, 'train/total_loss': 0.6738342046737671} -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32472651004791264, 'train/info_loss': 0.22495689988136292, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001121837878599763, 'train/video_loss': 0.22484470903873444, 'train/total_loss': 0.5495712161064148} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0239, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}[Rank 3] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06} - -[Rank 0] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06} -[Rank 1] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06} -{'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06, 'epoch': 0.66} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.4046, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00038167270831763745, 'train/lm_loss': 2.8584344545379284e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.49088212847709656, 'train/uncertainty_loss': 0.04045624732971192, 'train/video_loss': 0.5344108939170837, 'train/total_loss': 0.5344395041465759} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.24148297309875488, 'train/info_loss': 0.2007106989622116, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011322950012981892, 'train/video_loss': 0.20059746503829956, 'train/total_loss': 0.44208043813705444} -tensor(0.0585, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 2] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 1] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06} - - -[Rank 3] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06} -{'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06, 'epoch': 0.66} -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.2698, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4741, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020899735391139985, 'train/lm_loss': 2.796454355120659e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.5671656727790833, 'train/uncertainty_loss': 0.04740565419197083, 'train/video_loss': 0.6162612438201904, 'train/total_loss': 0.6162891983985901} -tensor(0.2692, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.2025, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020984613802284003, 'train/lm_loss': 4.1623553261160855e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.34875354170799255, 'train/uncertainty_loss': 0.020245166122913362, 'train/video_loss': 0.3707001805305481, 'train/total_loss': 0.3707418143749237} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.1522, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06} -[Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06} - -[Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06} -{'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06, 'epoch': 0.66} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3018145322799683, 'train/info_loss': 0.25296148657798767, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014256734866648912, 'train/video_loss': 0.25281891226768494, 'train/total_loss': 0.5546334385871887} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.7656, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.39072275161743164, 'train/info_loss': 0.18714040517807007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010978373466059567, 'train/video_loss': 0.187030628323555, 'train/total_loss': 0.5777533650398254} -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.0862, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 0] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 3] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06} - -[Rank 2] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06} - -{'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06, 'epoch': 0.66} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.3718518972396851, 'train/info_loss': 0.2331818789243698, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013318745186552405, 'train/video_loss': 0.2330486923456192, 'train/total_loss': 0.604900598526001} -tensor(0.2073, device='cuda:2', grad_fn=) tensor(-0.0006, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:3', grad_fn=) tensor(-0.0010, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1576, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00022317415568977596, 'train/lm_loss': 2.8441313770599666e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.14411097764968872, 'train/uncertainty_loss': -6.683562532998622e-05, 'train/video_loss': 0.1458483785390854, 'train/total_loss': 0.14587682485580444} -[Rank 3] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}[Rank 0] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06} -[Rank 2] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06} -[Rank 1] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06} - -{'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06, 'epoch': 0.67} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:3', grad_fn=) tensor(-0.0013, device='cuda:3', grad_fn=) -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0010, device='cuda:0', grad_fn=) tensor(-0.0010, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.062081152200698854, 'train/info_loss': 0.16616980731487274, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001029652892611921, 'train/video_loss': 0.16606684029102325, 'train/total_loss': 0.22814799845218658} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018424011068418622, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.03961974009871483, 'train/uncertainty_loss': -6.689630681648851e-05, 'train/video_loss': 0.04104745015501976, 'train/total_loss': 0.04108402132987976} -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06} -[Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06} - -[Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06} -{'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06, 'epoch': 0.67} -tensor(-0.0014, device='cuda:3', grad_fn=) tensor(-0.0014, device='cuda:3', grad_fn=) -tensor(-0.0013, device='cuda:2', grad_fn=) tensor(-0.0013, device='cuda:2', grad_fn=) -tensor(0.0723, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +tensor(0.0401, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002113046357408166, 'train/lm_loss': 4.093228199053556e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.0825377106666565, 'train/uncertainty_loss': -6.546297227032483e-05, 'train/video_loss': 0.08418301492929459, 'train/total_loss': 0.08422394841909409} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.1926, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1136, device='cuda:3', grad_fn=) tensor(0.0902, device='cuda:0', grad_fn=)tensor(-0.0007, device='cuda:3', grad_fn=) - tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002980440389364958, 'train/lm_loss': 3.676076594274491e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.27036359906196594, 'train/uncertainty_loss': 0.00902046412229538, 'train/video_loss': 0.2817894220352173, 'train/total_loss': 0.2818261682987213} -[Rank 1] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06} -[Rank 3] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06} -[Rank 2] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06} -[Rank 0] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06} -{'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06, 'epoch': 0.67} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.22054891586303713, 'train/info_loss': 0.16172195971012115, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012107143411412836, 'train/video_loss': 0.16160088777542114, 'train/total_loss': 0.3821498155593872} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(0.0148, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:3', grad_fn=) tensor(-0.0009, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2663368940353394, 'train/info_loss': 0.10841624438762665, 'train/ref_loss': None, 'train/uncertainty_loss': -8.942196145653725e-05, 'train/video_loss': 0.10832682251930237, 'train/total_loss': 0.37466371059417725} -tensor(0.1076, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 2] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 3] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06} - - -[Rank 0] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06} -{'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06, 'epoch': 0.67} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(0.2763, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06459110975265503, 'train/info_loss': 0.21699465811252594, 'train/ref_loss': None, 'train/uncertainty_loss': -9.331009350717069e-05, 'train/video_loss': 0.21690134704113007, 'train/total_loss': 0.2814924716949463} -tensor(-0.0014, device='cuda:2', grad_fn=) tensor(-0.0014, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:1', grad_fn=) tensor(-0.0011, device='cuda:1', grad_fn=) -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(0.0351, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00018034547101706267, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.22903497517108917, 'train/uncertainty_loss': 0.003506988659501076, 'train/video_loss': 0.2340044528245926, 'train/total_loss': 0.23403652012348175} -[Rank 1] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06} -[Rank 0] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}[Rank 2] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06} - -[Rank 3] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06} -{'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06, 'epoch': 0.67} -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.07050921320915222, 'train/info_loss': 0.19436703622341156, 'train/ref_loss': None, 'train/uncertainty_loss': -8.927494054660201e-05, 'train/video_loss': 0.19427776336669922, 'train/total_loss': 0.2647869884967804} -tensor(0.1349, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.0307, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.15221678018569948, 'train/info_loss': 0.10548965632915497, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011310963891446591, 'train/video_loss': 0.10537654906511307, 'train/total_loss': 0.2575933337211609} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 0] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 3] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06} - -[Rank 2] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06} - -{'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06, 'epoch': 0.67} -tensor(-0.0014, device='cuda:1', grad_fn=) tensor(-0.0014, device='cuda:1', grad_fn=) -tensor(-0.0008, device='cuda:3', grad_fn=) tensor(-0.0008, device='cuda:3', grad_fn=) -tensor(0.1315, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.17559853792190552, 'train/info_loss': 0.09834705293178558, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001089532976038754, 'train/video_loss': 0.09823810309171677, 'train/total_loss': 0.2738366425037384} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(-0.0008, device='cuda:1', grad_fn=) tensor(-0.0008, device='cuda:1', grad_fn=) -tensor(0.3571, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00020730055402964356, 'train/lm_loss': 3.654622996691615e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.46529483795166016, 'train/uncertainty_loss': 0.035708272457122804, 'train/video_loss': 0.5026838779449463, 'train/total_loss': 0.5027204155921936} -[Rank 2] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}[Rank 0] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06} -[Rank 1] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06} - -[Rank 3] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06} -{'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06, 'epoch': 0.67} -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.1469507694244385, 'train/info_loss': 0.2718953788280487, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012867078185081483, 'train/video_loss': 0.271766722202301, 'train/total_loss': 0.41871750354766846} -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.5532, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.4076, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.08282127976417542, 'train/info_loss': 0.19566015899181366, 'train/ref_loss': None, 'train/uncertainty_loss': -8.814950124360622e-05, 'train/video_loss': 0.19557200372219086, 'train/total_loss': 0.2783932685852051} -tensor(0.6230, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 2] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06} -[Rank 1] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06} -[Rank 3] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06} -[Rank 0] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06} -{'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06, 'epoch': 0.67} -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -tensor(0.4353, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00014477868098765613, 'train/lm_loss': 2.8036060393787923e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.5355187058448792, 'train/uncertainty_loss': 0.043532878160476685, 'train/video_loss': 0.5802289247512817, 'train/total_loss': 0.5802569389343262} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0013, device='cuda:0', grad_fn=) tensor(-0.0013, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.2846735239028931, 'train/info_loss': 0.25035104155540466, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012556229485198857, 'train/video_loss': 0.2502254843711853, 'train/total_loss': 0.5348989963531494} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0010, device='cuda:1', grad_fn=) tensor(-0.0010, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06} -[Rank 0] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}[Rank 2] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06} -[Rank 3] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06} - -{'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06, 'epoch': 0.67} -tensor(-0.0012, device='cuda:0', grad_fn=) tensor(-0.0012, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.35250082015991213, 'train/info_loss': 0.12986838817596436, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011850049486383796, 'train/video_loss': 0.12974989414215088, 'train/total_loss': 0.48225072026252747} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(-0.0009, device='cuda:0', grad_fn=) tensor(-0.0009, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.06482524275779725, 'train/info_loss': 0.19261880218982697, 'train/ref_loss': None, 'train/uncertainty_loss': -9.101710165850819e-05, 'train/video_loss': 0.19252778589725494, 'train/total_loss': 0.2573530375957489} -tensor(-0.0009, device='cuda:1', grad_fn=) tensor(-0.0009, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}[Rank 0] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06} -[Rank 1] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06} -[Rank 2] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06} - -{'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06, 'epoch': 0.67} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(-0.0009, device='cuda:2', grad_fn=) tensor(-0.0009, device='cuda:2', grad_fn=) -tensor(0.0826, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00014374495949596167, 'train/lm_loss': 2.815525222104043e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2506270706653595, 'train/uncertainty_loss': 0.008263303339481354, 'train/video_loss': 0.2600594460964203, 'train/total_loss': 0.26008760929107666} -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.32692942619323734, 'train/info_loss': 0.17380265891551971, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010667567839846016, 'train/video_loss': 0.17369598150253296, 'train/total_loss': 0.5006253719329834} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -tensor(0.4338, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(-0.0007, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06} -[Rank 2] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}[Rank 1] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06} -[Rank 0] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06} - -{'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06, 'epoch': 0.67} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0012, device='cuda:3', grad_fn=) tensor(-0.0012, device='cuda:3', grad_fn=) -tensor(0.0401, device='cuda:0', grad_fn=) tensor(0.1457, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -tensor(-0.0007, device='cuda:2', grad_fn=) -{'train/tv_loss': 0.0001431380049325526, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.21703669428825378, 'train/uncertainty_loss': 0.0040077798068523405, 'train/video_loss': 0.22220520675182343, 'train/total_loss': 0.22222697734832764} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.19441068172454834, 'train/info_loss': 0.2764996588230133, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010609085438773036, 'train/video_loss': 0.2763935625553131, 'train/total_loss': 0.47080424427986145} -tensor(0.0335, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 3] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 0] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06} - -[Rank 2] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06} - -{'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06, 'epoch': 0.67} -tensor(-0.0008, device='cuda:2', grad_fn=) tensor(-0.0008, device='cuda:2', grad_fn=) -tensor(0.1813, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.1105, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.10859153270721436, 'train/info_loss': 0.21142883598804474, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010824981145560742, 'train/video_loss': 0.21132057905197144, 'train/total_loss': 0.3199121057987213} -tensor(-0.0012, device='cuda:1', grad_fn=) tensor(-0.0012, device='cuda:1', grad_fn=) -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.2025, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.00026534441858530047, 'train/lm_loss': 3.2517651561647654e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.34838536381721497, 'train/uncertainty_loss': 0.020245753228664398, 'train/video_loss': 0.3707745671272278, 'train/total_loss': 0.37080708146095276} -tensor(-0.0012, device='cuda:2', grad_fn=) tensor(-0.0012, device='cuda:2', grad_fn=) -[Rank 3] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 1] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 2] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06} - - -[Rank 0] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06} -{'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06, 'epoch': 0.67} -tensor(-0.0015, device='cuda:3', grad_fn=) tensor(-0.0015, device='cuda:3', grad_fn=) -tensor(-0.0011, device='cuda:0', grad_fn=) tensor(-0.0011, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.4450415134429932, 'train/info_loss': 0.18519927561283112, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010915255406871439, 'train/video_loss': 0.18509012460708618, 'train/total_loss': 0.6301316022872925} -tensor(0.2665, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.2499, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0014, device='cuda:0', grad_fn=) tensor(-0.0014, device='cuda:0', grad_fn=) -{'train/tv_loss': None, 'train/lm_loss': 0.31789367198944096, 'train/info_loss': 0.2461404800415039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014492600457742812, 'train/video_loss': 0.24599555134773254, 'train/total_loss': 0.5638892650604248} -tensor(-0.0007, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(0.4158, device='cuda:3', grad_fn=) tensor(-0.0006, device='cuda:3', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06} -[Rank 3] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06} -[Rank 0] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06} -[Rank 2] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06} -{'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06, 'epoch': 0.67} -tensor(-0.0010, device='cuda:2', grad_fn=) tensor(-0.0010, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0608, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -tensor(0.1152, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.000334132113493979, 'train/lm_loss': 3.177867038175464e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2817174196243286, 'train/uncertainty_loss': 0.011518295109272004, 'train/video_loss': 0.2959294617176056, 'train/total_loss': 0.2959612309932709} -tensor(-0.0011, device='cuda:2', grad_fn=) tensor(-0.0011, device='cuda:2', grad_fn=) -tensor(0.3057, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0002599612809717655, 'train/lm_loss': 4.741583543363959e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.42135030031204224, 'train/uncertainty_loss': 0.03057071566581726, 'train/video_loss': 0.4540237784385681, 'train/total_loss': 0.45407119393348694} -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) -tensor(0.0280, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=) -[Rank 1] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 2] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06} - -[Rank 0] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 3] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06} - -{'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06, 'epoch': 0.67} -tensor(-0.0011, device='cuda:3', grad_fn=) tensor(-0.0011, device='cuda:3', grad_fn=) -tensor(0.0697, device='cuda:0', grad_fn=) tensor(-0.0007, device='cuda:0', grad_fn=) -{'train/tv_loss': 0.0003155144164338708, 'train/lm_loss': 5.3374795243144035e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.22853229939937592, 'train/uncertainty_loss': 0.006969699263572693, 'train/video_loss': 0.2380506843328476, 'train/total_loss': 0.23810406029224396} -tensor(0.2178, device='cuda:2', grad_fn=) tensor(-0.0007, device='cuda:2', grad_fn=) -tensor(-0.0007, device='cuda:1', grad_fn=) tensor(-0.0007, device='cuda:1', grad_fn=) +{'train/tv_loss': 0.00021248778793960812, 'train/lm_loss': 8.719450561329723e-05, 'train/info_loss': 2.1636018573190086e-05, 'train/ref_loss': 0.02926066145300865, 'train/uncertainty_loss': -6.862442241981626e-05, 'train/video_loss': 0.0309135764837265, 'train/total_loss': 0.0310007706284523} +tensor(0.0051, device='cuda:3', grad_fn=) tensor(-0.0007, device='cuda:3', grad_fn=)