diff --git "a/train.log" "b/train.log"
--- "a/train.log"
+++ "b/train.log"
@@ -1,171 +1,40 @@
+[2025-04-10 01:18:01,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-04-10 01:18:01,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-04-10 01:18:01,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-04-10 01:18:01,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
 Global rank 1, Local Rank: 1 initiated
+[2025-04-10 01:18:03,472] [INFO] [comm.py:652:init_distributed] cdb=None
+Global rank 3, Local Rank: 3 initiated
+[2025-04-10 01:18:03,485] [INFO] [comm.py:652:init_distributed] cdb=None
 Global rank 0, Local Rank: 0 initiated
-[2025-04-10 00:47:09,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2025-04-10 00:47:09,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2025-04-10 00:47:10,150] [INFO] [comm.py:652:init_distributed] cdb=None
-[2025-04-10 00:47:10,152] [INFO] [comm.py:652:init_distributed] cdb=None
-[2025-04-10 00:47:10,152] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+Global rank 2, Local Rank: 2 initiated
+[2025-04-10 01:18:03,555] [INFO] [comm.py:652:init_distributed] cdb=None
+[2025-04-10 01:18:03,555] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-04-10 01:18:03,558] [INFO] [comm.py:652:init_distributed] cdb=None
 GPU 0 - Using device: cuda
+GPU 3 - Using device: cuda
 GPU 1 - Using device: cuda
-LiveTrainingArguments(
-_n_gpu=1,
-accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
-adafactor=False,
-adam_beta1=0.9,
-adam_beta2=0.999,
-adam_epsilon=1e-08,
-attn_implementation=flash_attention_2,
-augmentation=False,
-auto_find_batch_size=False,
-bf16=True,
-bf16_full_eval=False,
-data_seed=None,
-dataloader_drop_last=False,
-dataloader_num_workers=4,
-dataloader_persistent_workers=False,
-dataloader_pin_memory=True,
-dataloader_prefetch_factor=None,
-dataset_config=configs/datasets/paperspace_configuration.json,
-ddp_backend=None,
-ddp_broadcast_buffers=None,
-ddp_bucket_cap_mb=None,
-ddp_find_unused_parameters=None,
-ddp_timeout=1800,
-debug=[],
-deepspeed=configs/deepspeed/zero2.json,
-disable_tqdm=False,
-dispatch_batches=None,
-do_eval=False,
-do_predict=False,
-do_train=False,
-embed_mark=2fps_384_1+3x3,
-eval_accumulation_steps=None,
-eval_delay=0,
-eval_do_concat_batches=True,
-eval_steps=None,
-evaluation_strategy=no,
-finetune_modules=['connector', 'mm_projector', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'],
-first_n_frames_no_generate=0,
-fp16=False,
-fp16_backend=auto,
-fp16_full_eval=False,
-fp16_opt_level=O1,
-frame_fps=2,
-frame_num_tokens=49,
-frame_resolution=384,
-frame_token_cls=False,
-frame_token_pooled=[7, 7],
-fsdp=[],
-fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
-fsdp_min_num_params=0,
-fsdp_transformer_layer_cls_to_wrap=None,
-full_determinism=False,
-gradient_accumulation_steps=16,
-gradient_checkpointing=True,
-gradient_checkpointing_kwargs=None,
-greater_is_better=None,
-grounding_mode=False,
-group_by_length=False,
-half_precision_backend=auto,
-hub_always_push=False,
-hub_model_id=None,
-hub_private_repo=False,
-hub_strategy=every_save,
-hub_token=,
-ignore_data_skip=False,
-include_inputs_for_metrics=False,
-include_num_input_tokens_seen=False,
-include_tokens_per_second=False,
-input_dir=dataset/tvsum/ydata-tvsum50-v1_1/video,
-jit_mode_eval=False,
-label_names=None,
-label_smoothing_factor=0.0,
-learning_rate=2e-05,
-length_column_name=length,
-live_version=live1+,
-llm_pretrained=lmms-lab/llava-onevision-qwen2-7b-ov,
-load_best_model_at_end=False,
-local_rank=0,
-log_level=passive,
-log_level_replica=warning,
-log_on_each_node=True,
-logging_dir=outputs/aha/runs/Apr10_00-47-09_psdppq1rwe2a,
-logging_first_step=False,
-logging_nan_inf_filter=True,
-logging_steps=10,
-logging_strategy=steps,
-lora_alpha=32,
-lora_modules=model\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$,
-lora_pretrained=None,
-lora_r=16,
-lr_scheduler_kwargs={},
-lr_scheduler_type=cosine,
-max_grad_norm=1.0,
-max_num_frames=100,
-max_steps=-1,
-metric_for_best_model=None,
-mp_parameters=,
-neftune_noise_alpha=None,
-no_cuda=False,
-num_train_epochs=1.0,
-optim=adamw_torch,
-optim_args=None,
-optim_target_modules=None,
-output_dir=outputs/aha,
-overwrite_output_dir=False,
-past_index=-1,
-per_device_eval_batch_size=1,
-per_device_train_batch_size=1,
-prediction_loss_only=False,
-push_to_hub=True,
-push_to_hub_model_id=None,
-push_to_hub_organization=None,
-push_to_hub_token=,
-quantization=False,
-ray_scope=last,
-remove_unused_columns=True,
-report_to=['wandb'],
-resume_from_checkpoint=None,
-run_name=outputs/aha,
-save_on_each_node=False,
-save_only_model=False,
-save_safetensors=True,
-save_steps=500,
-save_strategy=steps,
-save_total_limit=5,
-seed=42,
-skip_memory_metrics=True,
-split_batches=None,
-stream_loss_weight=1.0,
-tf32=True,
-torch_compile=False,
-torch_compile_backend=None,
-torch_compile_mode=None,
-torchdynamo=None,
-tpu_metrics_debug=False,
-tpu_num_cores=None,
-use_cache=False,
-use_cpu=False,
-use_ipex=False,
-use_legacy_prediction_loop=False,
-use_mps_device=False,
-v_placeholder=,
-video_pooling_stride=4,
-vision_pretrained=google/siglip-large-patch16-384,
-warmup_ratio=0.05,
-warmup_steps=0,
-weight_decay=0.0,
-)
-using lm_loss_weight: 0.5, video_loss_weight: 1, info_loss_weight: 2.0, ref_loss_weight: 1.0, uncertainty_loss_weight: 0.2, and tv_loss_weight: 0.1 for training
+GPU 2 - Using device: cuda
+Wandb initialized
 Rank 0: Loading vision tower: google/siglip-so400m-patch14-384
-using lm_loss_weight: 0.5, video_loss_weight: 1, info_loss_weight: 2.0, ref_loss_weight: 1.0, uncertainty_loss_weight: 0.2, and tv_loss_weight: 0.1 for training
-creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
-creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
+using lm_loss_weight: 0.5, video_loss_weight: 1, info_loss_weight: 2.0, ref_loss_weight: 1.0, uncertainty_loss_weight: 0.2, and tv_loss_weight: 0.1 for training
+creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'related_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
+creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'related_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
+creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'related_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
+creating lora with config: LoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='model\\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['connector', 'mm_projector', 'response_head', 'related_head', 'lm_head', 'informative_head', 'relevance_head', 'uncertainty_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
 trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503
-freezing ViT
+[Rank 3] Distributed initialized? True
+[Rank 3] Backend: nccl
+load datasets/coin/videos_metadata.json...
+trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503
+[Rank 1] Distributed initialized? True
+[Rank 1] Backend: nccl
+load datasets/coin/videos_metadata.json...
+trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503
+[Rank 2] Distributed initialized? True
+[Rank 2] Backend: nccl
 load datasets/coin/videos_metadata.json...
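For readers unfamiliar with the "creating lora with config" and "trainable params" lines above: they come from a standard PEFT LoRA setup. The following is a minimal, hypothetical sketch (not the repo's actual training code; a small stand-in checkpoint, Qwen/Qwen2-0.5B, is used so it runs, whereas the logged run wraps lmms-lab/llava-onevision-qwen2-7b-ov) that reproduces the logged configuration and the trainable-parameter summary:

# Hypothetical sketch of the LoRA setup reflected in the log; not the training script itself.
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Stand-in base model so the snippet is runnable on a single GPU/CPU.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B", torch_dtype=torch.bfloat16)

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,                 # lora_r in the logged arguments
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Regex from the log: apply LoRA to every decoder layer's attention/MLP projections.
    target_modules=r"model\.layers.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
    # Heads kept fully trainable and saved alongside the adapter (from the log);
    # names that do not exist in the stand-in model are simply never matched.
    modules_to_save=["connector", "mm_projector", "response_head", "related_head",
                     "lm_head", "informative_head", "relevance_head", "uncertainty_head"],
)

model = get_peft_model(model, lora_config)
# Prints the same "trainable params: ... || all params: ... || trainable%: ..." line as the log;
# for the 7B OneVision model that is 1,164,355,584 / 8,632,726,048 * 100 ≈ 13.49%.
model.print_trainable_parameters()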
trainable params: 1,164,355,584 || all params: 8,632,726,048 || trainable%: 13.487692966577503 -freezing ViT ('base_model.model.model.image_newline', torch.Size([3584]), torch.bfloat16, False) ('base_model.model.model.embed_tokens.weight', torch.Size([152064, 3584]), torch.bfloat16, False) ('base_model.model.model.layers.0.self_attn.q_proj.weight', torch.Size([3584, 3584]), torch.bfloat16, False) @@ -1334,20 +1203,57 @@ freezing ViT ('base_model.model.relevance_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.uncertainty_head.original_module.weight', torch.Size([1, 3584]), torch.bfloat16, True) ('base_model.model.uncertainty_head.modules_to_save.default.weight', torch.Size([1, 3584]), torch.bfloat16, True) +[Rank 0] Distributed initialized? True +[Rank 0] Backend: nccl load datasets/coin/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30830, 30881), range(31620, 31653)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +load datasets/shot2story/release_134k_videos_metadata.json... +Dataset DenseVideoCaptioningStreamDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. 
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nSimply interpret the scene for me.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30833, 30884), range(31623, 31656)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset MAGQAStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease simply describe what do you see.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30838, 30889), range(31628, 31661)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. 
There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +Dataset MAGQAStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +load datasets/shot2story/release_134k_videos_metadata.json... +Dataset DenseVideoCaptioningStreamDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nContinuously answer what you observed with simple text.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30854, 30905), range(31644, 31677)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +load datasets/shot2story/release_134k_videos_metadata.json... +Dataset MAGQAStreamDataset has 6 examples. 
Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +load datasets/shot2story/release_134k_videos_metadata.json... +Dataset DenseVideoCaptioningStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhat can you tell me about? Be concise.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. 
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease concisely narrate the video in real time.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nput the melted soap block into the vessel<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\ntake out after freezing<|im_end|>', torch.Size([91, 3, 384, 384]), [range(30847, 30898), range(31637, 31670)], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], None, 0] +Dataset MAGQAStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset MAGQAStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] +Dataset MAGQAStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. 
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>user\nWhat happens during the basketball game?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are celebrating by high-fiving each other.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA successful shot is made by a player in a white jersey after a series of passes.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nA player in a white jersey passes the ball and the teammate shoots over the defending blue players.<|im_end|>', torch.Size([36, 3, 384, 384]), [range(1371, 1456), range(4939, 5000), range(8826, 8917), range(13086, 13195)], [-100, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], None, 0] load datasets/shot2story/release_134k_videos_metadata.json... -Dataset DenseVideoCaptioningStreamDataset has 1 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nHelp me to illustrate my view in short.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +Dataset DenseVideoCaptioningStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nDo concise real-time narration.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. 
There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1675, 1782), range(5265, 5509), range(8992, 9334), range(13846, 14247)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +load datasets/hisum/videos_metadata.json... +Dataset DenseVideoCaptioningStreamDataset has 6 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease simply describe what do you see.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1683, 1790), range(5273, 5517), range(9000, 9342), range(13854, 14255)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] +load datasets/hisum/videos_metadata.json... +Dataset DenseVideoCaptioningStreamDataset has 6 examples. 
Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nPlease concisely narrate the video in real time.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nSeveral players in white jerseys are high-fiving each other. There is a black screen behind them.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game going on. The players wear blue and white jerseys. There are many spectators sitting in the stands. A player in a white jersey passes the ball to his teammate, who takes the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game taking place in the stadium. There is a huge crowd watching from the stands. The players wear blue and white jerseys. A player in a white jersey passes the ball to his teammate, who passes the ball to another teammate. The last player wearing the white jersey catches the ball and makes a successful shot.<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>assistant\nThere is a basketball game played by two teams, one is blue and the other is white. The game is played in a large stadium with many people watching in the stands. A player in a white jersey passes the ball to his teammate. There are several players in blue jerseys in front of him who are preventing him from passing the ball. Eventually, his teammate catches the ball and jumps up to shoot.<|im_end|>', torch.Size([37, 3, 384, 384]), [range(1692, 1799), range(5282, 5526), range(9009, 9351), range(13863, 14264)], [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], None, 0] load datasets/hisum/videos_metadata.json... -Dataset HiSumDataset has 2 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. 
Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stre
am\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0] -Dataset HiSumDataset has 2 examples. 
Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nAt what timestamp can I find information about 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)' in the video?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end
|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>\n<|im_start|>stream\n<|im_end|>", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 
0.17923914291191922, 0.17923914291191922], 0]
-{'train_runtime': 36.5455, 'train_samples_per_second': 0.137, 'train_steps_per_second': 0.027, 'train_loss': 2.332942247390747, 'epoch': 1.0}
-Saving model to huggingface
+Dataset HiSumDataset has 5 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nCan you point out the video segments that cover 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n[... remaining <|im_start|>stream\n<|im_end|> turns truncated ...]", torch.Size([128, 3, 384, 384]), [], None, [0.3134304745870105, 0.3134304745870105, 0.3134304745870105, 0.27341293597518773, 0.27341293597518773, 0.27341293597518773, 0.22936455739720477, 0.22936455739720477, 0.26177033946041395, 0.26177033946041395, 0.26177033946041395, 0.22304313139994245, 0.22304313139994245, 0.2212598205977789, 0.2212598205977789, 0.2212598205977789, 0.1384467035012884, 0.1384467035012884, 0.09841897173495975, 0.09841897173495975, 0.09841897173495975, 0.08928153329409091, 0.08928153329409091, 0.07530286226111897, 0.07530286226111897, 0.07530286226111897, 0.07876354874403942, 0.07876354874403942, 0.10280863530154427, 0.10280863530154427, 0.10280863530154427, 0.12141787221289221, 0.12141787221289221, 0.09718365965612202, 0.09718365965612202, 0.09718365965612202, 0.1339686419514109, 0.1339686419514109, 0.14644916489020246, 0.14644916489020246, 0.14644916489020246, 0.12855772255061168, 0.12855772255061168, 0.15685082466979675, 0.15685082466979675, 0.15685082466979675, 0.1874172458619692, 0.1874172458619692, 0.169421366671875, 0.169421366671875, 0.169421366671875, 0.10439230764630152, 0.10439230764630152, 0.09539713125578915, 0.09539713125578915, 0.09539713125578915, 0.05157612142783203, 0.05157612142783203, 0.07396991695538777, 0.07396991695538777, 0.07396991695538777, 0.07995037579397458, 0.07995037579397458, 0.08992905650430286, 0.08992905650430286, 0.08992905650430286, 0.1344478430222767, 0.1344478430222767, 0.13905770328800585, 0.13905770328800585, 0.13905770328800585, 0.11761845369674213, 0.11761845369674213, 0.18842882433985786, 0.18842882433985786, 0.18842882433985786, 0.6550083383074086, 0.6550083383074086, 0.7350656194146042, 0.7350656194146042, 0.7350656194146042, 0.46149839837910667, 0.46149839837910667, 0.33264484101055325, 0.33264484101055325, 0.33264484101055325, 0.2907620288096126, 0.2907620288096126, 0.2894626103855693, 0.2894626103855693, 0.2894626103855693, 0.2957340776448439, 0.2957340776448439, 0.268680954349933, 0.268680954349933, 0.268680954349933, 0.22498931161783697, 0.22498931161783697, 0.22939634038980858, 0.22939634038980858, 0.22939634038980858, 0.20997150374708343, 0.20997150374708343, 0.24985810330668756, 0.24985810330668756, 0.24985810330668756, 0.2687247726333991, 0.2687247726333991, 0.2758359102057756, 0.2758359102057756, 0.2758359102057756, 0.24908047614607193, 0.24908047614607193, 0.24950377451897307, 0.24950377451897307, 0.24950377451897307, 0.24294203569355335, 0.24294203569355335, 0.25994721395113235, 0.25994721395113235, 0.25994721395113235, 0.2375320987654776, 0.2375320987654776, 0.18952636918104893, 0.18952636918104893, 0.18952636918104893, 0.17923914291191922, 0.17923914291191922], 0]
+Starting Training!
+Dataset HiSumDataset has 5 examples. Example data: ['<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nCustomized Yamaha MT-01 akrapovic HD (Without DB-killer)<|im_end|>\n<|im_start|>stream\n<|im_end|>\n[... remaining <|im_start|>stream\n<|im_end|> turns truncated ...]', torch.Size([128, 3, 384, 384]), [], None, [... same 128 per-frame scores as in the first example above ...], 0]
+Starting Training!
+Dataset HiSumDataset has 5 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nWhich clips in the video relate to the query 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n[... remaining <|im_start|>stream\n<|im_end|> turns truncated ...]", torch.Size([128, 3, 384, 384]), [], None, [... same 128 per-frame scores as in the first example above ...], 0]
+Starting Training!
+Dataset HiSumDataset has 5 examples. Example data: ["<|im_start|>system\nA multimodal AI assistant is helping users with some activities. Below is their conversation, interleaved with the list of video frames received by the assistant.<|im_end|>\n<|im_start|>user\nCan you point out the video segments that cover 'Customized Yamaha MT-01 akrapovic HD (Without DB-killer)'?<|im_end|>\n<|im_start|>stream\n<|im_end|>\n[... remaining <|im_start|>stream\n<|im_end|> turns truncated ...]", torch.Size([128, 3, 384, 384]), [], None, [... same 128 per-frame scores as in the first example above ...], 0]
+Starting Training!
+ninja: no work to do.
+Time to load cpu_adam op: 2.271890163421631 seconds
+ninja: no work to do.
+Time to load cpu_adam op: 2.244173526763916 seconds
+Time to load cpu_adam op: 2.284147024154663 seconds
+ninja: no work to do.
+Time to load cpu_adam op: 2.2384707927703857 seconds
+{'train/tv_loss': None, 'train/lm_loss': 0.8570868372917175, 'train/info_loss': 9.467206954956055, 'train/ref_loss': None, 'train/uncertainty_loss': 0.013913665711879731, 'train/video_loss': 9.481121063232422}
+{'train/tv_loss': None, 'train/lm_loss': 1.0620988607406616, 'train/info_loss': 6.471096038818359, 'train/ref_loss': None, 'train/uncertainty_loss': 0.02550567388534546, 'train/video_loss': 6.496601581573486}
+{'train/tv_loss': None, 'train/lm_loss': 0.5937383770942688, 'train/info_loss': 6.909224033355713, 'train/ref_loss': None, 'train/uncertainty_loss': 0.07672024965286256, 'train/video_loss': 6.9859442710876465}
+{'train/tv_loss': None, 'train/lm_loss': 1.390915036201477, 'train/info_loss': 9.34560489654541, 'train/ref_loss': None, 'train/uncertainty_loss': 0.0060056507587432865, 'train/video_loss': 9.35161018371582}
+{'train/tv_loss': None, 'train/lm_loss': 0.7876078486442566, 'train/info_loss': 5.637686252593994, 'train/ref_loss': None, 'train/uncertainty_loss': 0.07472547292709351, 'train/video_loss': 5.712411880493164}
+{'train/tv_loss': None, 'train/lm_loss': 2.279966115951538, 'train/info_loss': 10.876502990722656, 'train/ref_loss': None, 'train/uncertainty_loss': 0.09283035397529603, 'train/video_loss': 10.96933364868164}
+{'loss': 6.1814, 'learning_rate': 2e-05, 'epoch': 1.0}
+Finished Training
+Finished Training
+Finished Training
+{'train_runtime': 63.3431, 'train_samples_per_second': 0.347, 'train_steps_per_second': 0.016, 'train_loss': 6.181375026702881, 'epoch': 1.0}
+Finished Training
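The per-step dicts above are plain Python literals, so they can be summarised offline without rerunning training. A minimal sketch of such a helper follows; it assumes the log was saved as train.log, and both the filename and the helper itself are illustrative rather than part of the training code.

import ast
from collections import defaultdict

def summarise(log_path="train.log"):  # "train.log" is an assumed filename
    """Average each non-None 'train/*' loss component found in the log."""
    sums, counts = defaultdict(float), defaultdict(int)
    with open(log_path) as f:
        for line in f:
            line = line.strip().lstrip("+-")  # tolerate diff prefixes
            if not line.startswith("{'train/"):
                continue  # skip everything except the per-step loss dicts
            record = ast.literal_eval(line)  # the dicts are valid Python literals
            for key, value in record.items():
                if value is not None:
                    sums[key] += value
                    counts[key] += 1
    return {key: sums[key] / counts[key] for key in sums}

if __name__ == "__main__":
    for name, mean in summarise().items():
        print(f"{name}: {mean:.4f}")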