diff --git a/V4.3-ckpt/ckpt_iter0005000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0005000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5185e230c86e8d9e36b0fdec7c7bf9a1945c02fd --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0005000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c056016d8b8a248e7d54c844b6c004cfe54e1e483921f5b1952b065ca85349 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0005000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0005000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85088cb9134b5f848c6f01dfe1d371b579d6d746 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0005000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b531154b45173908020ee9a162a8100f9cc3d8d10deb37b09d68389308a6135d +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0010000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0010000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74feb99ce05d5ebe70a9876db341acf9f6251e78 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0010000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da64c8d2b0268d31f45c524311146f16e9cbb688d917100e125834b617c2c9b5 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0010000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0010000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..119a5b38bd3048cc8de857f64af3ce9df8eb7f16 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0010000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b380f89583e0e2a2ca080f5ba4036d78b87c669669dd7a6fb2d5ba9d3cda83dd +size 
26443493 diff --git a/V4.3-ckpt/ckpt_iter0015000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0015000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..176569ca744f67b98de9b00a858122f293a9d98b --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0015000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d81bfb74f3b4f2052f26112507bdfae7927aa34eb499538107394f7a960d3c8c +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0015000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0015000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c4f85674a0305dab974e7beac9944e26d7825a6 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0015000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f49055976a46362bf30c1be8ca9157e66a90f2475ed9c76cf0f8137cd86f05b +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0020000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0020000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef8490e782846a9bfa51f2b91438864d495a01fd --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0020000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e75ff5e61aa6a945214debc3e1a795d94942ff9a808dc0f6477e822cad626c4b +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0020000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0020000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b41cae5fe23d595fabd84f7f4fc234593f019237 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0020000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62837c90687bd793e4f6e7d8913c6f6d151f4d61ddacbcb69d10c29a20fe86a7 
+size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0025000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0025000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65912df0e611559bcaac4e0ecab674b6c59cfd38 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0025000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ca26f145d58915af43963f5c3801eedccbbf4206a50fa85cab73d7835811a5d +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0025000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0025000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97388ece21c833456cc24764176c8b711828872f --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0025000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c323dd0af112f1acb8c4b73beda2e326aa6f5e78d3c77e65b69aab953e49ead0 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0030000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0030000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0414c993a8ef0e760600e8664a16e9b52301a41 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0030000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee656e4312a9700e9e379b217f3e550c356f2ccda11b15e01c4c3f0e22da5ae +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0030000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0030000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ff5b8fb243b4b961db70e46a534b4501a4ff824 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0030000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5d7b9ee7f0523c215814a1ca476a77847a79eff99d1b39acb3a15287ba0e5d27 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0035000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0035000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..824296bb451f16b67489219b43c212b5222c0d29 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0035000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da5825bf83934e65a7210eb7cd1e0bc057bb3a1d1554c9238f559cb8314a361 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0035000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0035000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9b50a66289a8a5f7a84e0b2ef3728e755879059 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0035000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ea7171231f41b203c368337ff928a17e16c81df3e0d843a1915594d8e26f495 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0040000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0040000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a050eb1cdbc71425dd06cbd1e4d064be8047d1b9 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0040000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5448f30ccdae36c771d6945b984aff7383872024ea17b1afc659f6a8dea2fb6 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0040000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0040000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78d3dea48e4bd6ce6e36a1be73fb294da27e7fb4 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0040000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e70668941469527626a4fa068217258aa7d5269f56d5beaa4da58395e5928ab5 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0045000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0045000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7663630175ede38c0841b2fa264e566cee250b94 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0045000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:046269135cd8be837f2397b20a241cd657b3ed8146689ebc03f31d416e96aeb4 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0045000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0045000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddae4f32727ea423f13957c7aefe21e3cd2efd7b --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0045000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb4432e39cf49637932611eb23f53f5259e0e97936500334183a0d1f872eaa6 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0050000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0050000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66e8f875871a4cc0376cde384aa7ad1ed5f1ab3d --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0050000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a023ee1da3bdee2870c96ecc671bc6a877826f3d78939e79fd6705d7133439 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0050000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0050000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ee99e5d119e2c92aa3c53ead13dba7ea973431f --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0050000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d5d9e063bde8ee672fcad75c4f9618615f6edb7c18350b56c04973a2723f6c45 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0055000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0055000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..299dca5241409693724ecbbdc470c11bea68978a --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0055000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1da6e6b735e6d45595a608e43ae7d161deb7922f02757636e5b0d3ee06e50e7 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0055000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0055000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fca30b61d034e072790005ffde156f4c04295855 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0055000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58283a54fa4c0ddb9d757b5e7f40dd4fcd4a4a0fa1b40ae5a7b69c5354b5741 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0060000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0060000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..135aaacea23c7aab0f9b4b3246878c8c097733b0 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0060000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d2ad0d1fc3ac4f3d0dbf083955bbfaf32b893c1221b80cb4f7b430bbf3ea45 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0060000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0060000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a7d197851ae5e87c7345b9a922c4dc6690b311 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0060000.pth/mp_rank_00_model_states.pt @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc97f15576086587cb20c2745cf41c59881c3e2d3d9df2915f8d157ca7d815a +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0065000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0065000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4307acb6612562097a03685055094a93682915a3 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0065000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aead2cd83a525d092ebd67b163a32c1b26f115f5d990a63d5d89d8bc7354eef +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0065000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0065000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41f5c8673d4dee1359296455f95a186fe3270f2c --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0065000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a971fc817b840f3f5f0dce4dbdafec4342e4e6879bf1ac464cade9ad033db11 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0070000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0070000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce97b29b403d8520e461039e2e4508a125f82778 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0070000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd8c3d952773317fc8c7e86456c26d63cb3a0c97d7caaa2c6fd62ec7e39135dd +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0070000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0070000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f455497d4642175c97960bc6c763bade38fe5c44 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0070000.pth/mp_rank_00_model_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff32e409e5a8d8b3f7f71edd2e8cb251890ea7a2be606c6ca03c6a931673c10a +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0075000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0075000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3823c180e428af87b10b9d035fb4c3632d808d74 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0075000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed28a79904a7820cbf7759416ff99f0bd13fa938f88192e05b637f641297ad3 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0075000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0075000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2738ff521437617453ea27a3c8542e349a149717 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0075000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa06fb8ebce63e2be53630a27cca6e736e2d62691c409cab3f8bba9768fb8c69 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0080000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0080000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53301ed2c47793fa8d2e42fb2dc2d19a0213098e --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0080000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:360fa7b5e63ace9b88522feea8df6c4e3ce1c4d2d40fd0da960d582e6284af4b +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0080000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0080000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ecd3f0ba7e7496f59607d249581ad2d6e6d9222 --- /dev/null +++ 
b/V4.3-ckpt/ckpt_iter0080000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e33aed92487ae27d3fa58c8f3d986c4c9bfa4e5cb8c6ffa0023047b8912b0441 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0085000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0085000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9da1885349a04e0e284326e6059b00790e171be9 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0085000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bae8dd84fdfae832e273b3f4495533d441105ad22a23f09a7721f1fdac88c4b9 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0085000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0085000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f508c17d2ef52961ad5af77db0c9ed4857e9c48 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0085000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:320ed1fa617cfd7e59180f4035c35a0ba9483500e8747832c532b2304e9fcd26 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0090000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0090000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84913a0c21da74213bdebfc7252cbe6dbf7b46cb --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0090000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5356ed18783ee39e805ebb55315be818e386adfbd5b4af38c4d3c41bf9696c0 +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0090000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0090000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a535d79d37bb009b28e7f445c382cc858b67da8 --- 
/dev/null +++ b/V4.3-ckpt/ckpt_iter0090000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f66984aab3c7fdf34c7058e19c6a175e1d93838e5e7c39cbc4702a221ef1b384 +size 26443493 diff --git a/V4.3-ckpt/ckpt_iter0095000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/V4.3-ckpt/ckpt_iter0095000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c5f35c1345791ae174a5f1119c6802355cdd326 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0095000.pth/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:878e1ccefc862a1c2263801b39bac100dfb3586596988f10504cd45e69c7bc6f +size 158369093 diff --git a/V4.3-ckpt/ckpt_iter0095000.pth/mp_rank_00_model_states.pt b/V4.3-ckpt/ckpt_iter0095000.pth/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e8a68cad223757464b915807672729d08917965 --- /dev/null +++ b/V4.3-ckpt/ckpt_iter0095000.pth/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e6979b3666f1766dc13530a5eea3bd87d317550fd3849b8b3de9c5a9c3e0858 +size 26443493 diff --git a/V4.3-ckpt/config.json b/V4.3-ckpt/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ba97123a7e543efe82a873de041c914b8482b5a1 --- /dev/null +++ b/V4.3-ckpt/config.json @@ -0,0 +1,243 @@ +{ + "root_path": "/home/zli", + "available_corpus": { + "cc3m": { + "anno_path": "your_path", + "data_root": "", + "media_type": "image" + }, + "webvid_10m": { + "anno_path": "your_path", + "data_root": "", + "media_type": "video" + }, + "smol_test": { + "anno_path": "/root/IV2/InternVideo2/multi_modality/data_test/smol_test.json", + "data_root": "/root/IV2/InternVideo2/multi_modality/data_test/", + "media_type": "video" + }, + "slim_kinetics": { + "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json", + 
"data_root": "/home/zli/kinetics-dataset/k600/train/train", + "media_type": "video" + }, + "slim_kinetics_act_val": { + "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json", + "data_root": "/home/zli/kinetics-dataset/k600/test/", + "media_type": "video", + "is_act_rec": true + } + }, + "VisionEncoders": {}, + "TextEncoders": { + "bert": { + "name": "bert_base", + "pretrained": "bert-base-uncased", + "config": "configs/config_bert.json", + "d_model": 768, + "fusion_layer": 9 + }, + "bert_large": { + "name": "bert_large", + "pretrained": "bert-large-uncased", + "config": "configs/config_bert_large.json", + "d_model": 1024, + "fusion_layer": 19 + }, + "med_bert": { + "name": "med_bert_base", + "pretrained": "bert-base-uncased", + "config": "configs/med_config.json", + "d_model": 768 + }, + "med_bert_large": { + "name": "med_bert_large", + "pretrained": "bert-base-uncased", + "config": "configs/med_large_config.json", + "d_model": 768 + } + }, + "train_corpus": "slim_kinetics", + "train_file": { + "anno_path": "/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json", + "data_root": "/home/zli/kinetics-dataset/k600/train/train", + "media_type": "video" + }, + "test_file": { + "act_val": { + "anno_path": "/home/zli/kinetics-dataset/k600/test/kinetics-test.json", + "data_root": "/home/zli/kinetics-dataset/k600/test/", + "media_type": "video", + "is_act_rec": true + } + }, + "test_types": [ + "act_val" + ], + "num_workers": 2, + "stop_key": null, + "num_frames": 8, + "num_frames_test": 8, + "batch_size": 16, + "batch_size_test": 16, + "max_txt_l": 32, + "size_t": 224, + "inputs": { + "image_res": 224, + "video_input": { + "num_frames": 8, + "sample_type": "all", + "num_frames_test": 8, + "sample_type_test": "all", + "random_aug": false + }, + "max_txt_l": { + "image": 32, + "video": 32 + }, + "batch_size": { + "image": 16, + "video": 16 + }, + "batch_size_test": { + "image": 16, + "video": 16 + } + }, + "model": { + "model_cls": 
"InternVideo2_CLIP_small", + "vision_encoder": { + "name": "internvideo2", + "in_chans": 3, + "patch_size": 14, + "img_size": 224, + "qkv_bias": false, + "drop_path_rate": 0.0, + "head_drop_path_rate": 0.0, + "embed_dim": 768, + "num_heads": 12, + "mlp_ratio": 4, + "init_values": 0.1, + "qk_normalization": true, + "depth": 12, + "use_flash_attn": true, + "use_fused_rmsnorm": true, + "use_fused_mlp": true, + "fused_mlp_heuristic": 1, + "drop_cls_token": false, + "attn_pool_num_heads": 16, + "clip_embed_dim": 768, + "layerscale_no_force_fp32": true, + "num_frames": 8, + "tubelet_size": 1, + "sep_pos_embed": false, + "use_checkpoint": false, + "checkpoint_num": 0, + "align_dim": 512 + }, + "streaming_vision_encoder": { + "in_chans": 3, + "patch_size": 14, + "img_size": 224, + "vit_qkv_bias": true, + "vit_drop_path_rate": 0.05, + "student_embed_dim": 384, + "student_depth": 4, + "student_num_heads": 6, + "vit_mlp_ratio": 3.0, + "vit_init_values": null, + "vit_qk_normalization": false, + "vit_sep_pos_embed": true, + "vit_norm_layer_type": "rmsnorm", + "rnn_type": "lstm", + "rnn_hidden_size": 1024, + "rnn_num_layers": 1, + "fc_hidden_layers": [], + "teacher_clip_embed_dim": 768, + "student_num_frames_processed_by_vit": 1, + "student_tubelet_size_for_vit": 1 + }, + "text_encoder": { + "name": "mobileclip_b" + }, + "temp": 0.01, + "temp_min": 0.01, + "freeze_vision": true, + "open_vision_clip_projector": false, + "freeze_text": true, + "open_text_projection": false, + "open_text_lora": false, + "vision_ckpt_path": "/home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin", + "load_vision_ckpt_from_internvideo2_stage2": false, + "text_ckpt_path": "/home/zli/IV2/models/mobileclip_blt.pt", + "extra_ckpt_path": "/home/zli/IV2/models/clip/B14/pytorch_model.bin" + }, + "criterion": { + "loss_weight": { + "vtc": 1.0 + } + }, + "optimizer": { + "opt": "adamW", + "lr": 1e-05, + "opt_betas": [ + 0.9, + 0.98 + ], + "weight_decay": 0.01, + "max_grad_norm": 0.7, + 
"different_lr": { + "enable": false, + "module_names": [], + "lr": 1e-05 + } + }, + "scheduler": { + "sched": "cosine", + "epochs": 1, + "min_lr_multi": 0.01, + "warmup_epochs": 0.05 + }, + "evaluate": false, + "deep_fusion": false, + "evaluation": { + "eval_frame_ensemble": "concat", + "eval_x_only": false, + "k_test": 128, + "eval_offload": true + }, + "use_half_precision": true, + "use_bf16": true, + "gradient_checkpointing": true, + "wandb": { + "enable": true, + "entity": "qingy2019-conker-mobile-inc-", + "project": "window_iv2" + }, + "dist_url": "env://", + "device": "cuda", + "mode": "pt", + "output_dir": "scripts/pretraining/clip/B14/B14", + "resume": true, + "debug": false, + "log_freq": 1, + "seed": 42, + "save_latest": false, + "save_iter": 5000, + "eval_freq_steps": 1000, + "eval_video_repo_id": "qingy2024/backflip_train", + "eval_video_filename": "1.mp4", + "eval_plot_output_dir": "scripts/pretraining/clip/B14/cosine_sim_graphs", + "auto_resume": true, + "pretrained_path": "", + "deepspeed": { + "enable": true, + "stage": 1 + }, + "rank": 0, + "world_size": 1, + "gpu": 0, + "distributed": true, + "dist_backend": "nccl", + "deepspeed_config": "scripts/pretraining/clip/B14/B14/deepspeed_config.json" +} \ No newline at end of file diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0000000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0000000.png new file mode 100644 index 0000000000000000000000000000000000000000..32f0d260ad7bf8022e3d05a57ea2ede20623de35 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0000000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0001000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0001000.png new file mode 100644 index 0000000000000000000000000000000000000000..94929cee1fbf08c35df7f883531811c29042d73e Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0001000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0002000.png 
b/V4.3-ckpt/cosine_sim_graphs/graph_step_0002000.png new file mode 100644 index 0000000000000000000000000000000000000000..acfadfbc5aff62169e49fca839b5f8bdd4b84055 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0002000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0003000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0003000.png new file mode 100644 index 0000000000000000000000000000000000000000..f664799a90a1fcecfc3153951edda7a0ec1438c7 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0003000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0004000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0004000.png new file mode 100644 index 0000000000000000000000000000000000000000..ac06c0496478454d4bab76bd6ed2396486f4aa74 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0004000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0005000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0005000.png new file mode 100644 index 0000000000000000000000000000000000000000..16f63484f12ce8610d9a8e3c716dfc26d79bacdd Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0005000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0006000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0006000.png new file mode 100644 index 0000000000000000000000000000000000000000..2a60489c03419045317adc27355b8c2f4238af93 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0006000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0007000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0007000.png new file mode 100644 index 0000000000000000000000000000000000000000..19a6846a125f0636eff74eb3ef49e004c535cbad Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0007000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0008000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0008000.png new file mode 100644 index 
0000000000000000000000000000000000000000..e993416500b1eb5f5d69395909567808953675d5 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0008000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0009000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0009000.png new file mode 100644 index 0000000000000000000000000000000000000000..9a1e66c23b2dc541c238845ece80a6926d75d6cd Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0009000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0010000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0010000.png new file mode 100644 index 0000000000000000000000000000000000000000..8ab86f7a779ab9feee37f7f0c5ff2983f538244d Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0010000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0011000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0011000.png new file mode 100644 index 0000000000000000000000000000000000000000..23b5db49c466ce755f02440ba3754b628315a9d5 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0011000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0012000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0012000.png new file mode 100644 index 0000000000000000000000000000000000000000..a47f696af6e19f7a8e2f83540edda2c3e0c4ace2 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0012000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0013000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0013000.png new file mode 100644 index 0000000000000000000000000000000000000000..12066864c6b444b732eaec832e9ee20773e3fe0c Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0013000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0014000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0014000.png new file mode 100644 index 0000000000000000000000000000000000000000..4d257d1e30bce79e049a654eb4b238277bb18a25 Binary files /dev/null and 
b/V4.3-ckpt/cosine_sim_graphs/graph_step_0014000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0015000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0015000.png new file mode 100644 index 0000000000000000000000000000000000000000..7244def028c4b29d843812cd2047187cdb103053 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0015000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0016000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0016000.png new file mode 100644 index 0000000000000000000000000000000000000000..d75a25a45710cc781a21a202dddd0ac73e8e0d28 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0016000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0017000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0017000.png new file mode 100644 index 0000000000000000000000000000000000000000..b32c037cabb3cd198bc525653da0534b0e7bb42f Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0017000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0018000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0018000.png new file mode 100644 index 0000000000000000000000000000000000000000..2bcb7d6f9f7406a1043430677d66055771d45940 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0018000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0019000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0019000.png new file mode 100644 index 0000000000000000000000000000000000000000..ab6b3aeb535bd4c4ce21b952e61fbcbc86db0aa9 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0019000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0020000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0020000.png new file mode 100644 index 0000000000000000000000000000000000000000..082327526141cceef9ac9b496445e0471ce7ad18 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0020000.png differ diff --git 
a/V4.3-ckpt/cosine_sim_graphs/graph_step_0021000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0021000.png new file mode 100644 index 0000000000000000000000000000000000000000..b7f0b0b4a22d94267c500f3f64ae6b04bd9d6aa2 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0021000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0022000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0022000.png new file mode 100644 index 0000000000000000000000000000000000000000..0694760cc2dd3b29a3e944f5b0f424fbc7ef330b Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0022000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0023000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0023000.png new file mode 100644 index 0000000000000000000000000000000000000000..999684781c0a972848392f5f658c2ee551d6f0b5 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0023000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0024000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0024000.png new file mode 100644 index 0000000000000000000000000000000000000000..c7c02081e1148ea1cc850bf16aa808cf7d34635d Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0024000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0025000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0025000.png new file mode 100644 index 0000000000000000000000000000000000000000..2da67301d973342bb758ec7f30c230da59d05647 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0025000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0026000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0026000.png new file mode 100644 index 0000000000000000000000000000000000000000..c93ffc6a4fecdb54a819ca2611c06e25df35f430 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0026000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0027000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0027000.png new file 
mode 100644 index 0000000000000000000000000000000000000000..dc96fdac26abdb9414ec7be0483bce5ee9b0450c Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0027000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0028000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0028000.png new file mode 100644 index 0000000000000000000000000000000000000000..545671a473e4308ba9147a734974bb97983a8497 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0028000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0029000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0029000.png new file mode 100644 index 0000000000000000000000000000000000000000..55f00b06751fffb0471bd54e41742fc0797ce95c Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0029000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0030000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0030000.png new file mode 100644 index 0000000000000000000000000000000000000000..349d8ec9148906f9085cc9fae351e869d052b1dc Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0030000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0031000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0031000.png new file mode 100644 index 0000000000000000000000000000000000000000..8b76a7929c66a05ef0fec1d2213cd2293749f1b9 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0031000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0032000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0032000.png new file mode 100644 index 0000000000000000000000000000000000000000..a4f55ce88f5e26a269b334fbc4f4f9d20479ccab Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0032000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0033000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0033000.png new file mode 100644 index 0000000000000000000000000000000000000000..f2dfd95a4ae023992bbe21695b3c47e1e45bfa69 Binary files 
/dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0033000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0034000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0034000.png new file mode 100644 index 0000000000000000000000000000000000000000..3e82b910fe5e343ce5660fa28988bd8d4c1efc3e Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0034000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0035000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0035000.png new file mode 100644 index 0000000000000000000000000000000000000000..5d416f362efd795295b4f2d8f6f496985f7c365c Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0035000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0036000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0036000.png new file mode 100644 index 0000000000000000000000000000000000000000..24d7982d1829138ec111dd0106c4ccf6ed161efb Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0036000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0037000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0037000.png new file mode 100644 index 0000000000000000000000000000000000000000..dcd4dce4c21ac01c6c21338c416d2c258deea66f Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0037000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0038000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0038000.png new file mode 100644 index 0000000000000000000000000000000000000000..b1935796a922e302e94146a6e026facd59aea221 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0038000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0039000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0039000.png new file mode 100644 index 0000000000000000000000000000000000000000..5f83d00d447426117f6c7333fcb2d18e1a3edfcf Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0039000.png differ diff --git 
a/V4.3-ckpt/cosine_sim_graphs/graph_step_0040000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0040000.png new file mode 100644 index 0000000000000000000000000000000000000000..c93fe98e211470f451fc4053a94886fc82777d9f Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0040000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0041000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0041000.png new file mode 100644 index 0000000000000000000000000000000000000000..606d3b3c362035d6908473cecdc731f08763cf91 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0041000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0042000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0042000.png new file mode 100644 index 0000000000000000000000000000000000000000..0c7c72c463193ce5cfbdf7dbb0d5c94a9a6fe4ce Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0042000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0043000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0043000.png new file mode 100644 index 0000000000000000000000000000000000000000..26bafac14abb783c8ed53b68258034925dbac7ec Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0043000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0044000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0044000.png new file mode 100644 index 0000000000000000000000000000000000000000..6b8ef26bee3e3b217ea0b953a6818c40358e7f8e Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0044000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0045000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0045000.png new file mode 100644 index 0000000000000000000000000000000000000000..3cfdd2f84d8e655dcbe8ff079c5b272292e15312 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0045000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0046000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0046000.png new file 
mode 100644 index 0000000000000000000000000000000000000000..72e93b572cc8b4622ebe721114c4de7f2d10709d Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0046000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0047000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0047000.png new file mode 100644 index 0000000000000000000000000000000000000000..a754dfe93450b6d9c712da055d4deb227c62e6d6 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0047000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0048000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0048000.png new file mode 100644 index 0000000000000000000000000000000000000000..2f7b570bc4b9a065f0c1f757a9058284213d0cac Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0048000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0049000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0049000.png new file mode 100644 index 0000000000000000000000000000000000000000..b587e7f440dca88db794281f0fe1676270de2d23 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0049000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0050000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0050000.png new file mode 100644 index 0000000000000000000000000000000000000000..a6b22045a2b1c284288c18acf7c17151e304e570 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0050000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0051000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0051000.png new file mode 100644 index 0000000000000000000000000000000000000000..559a0a19b6d0558c4169f86eb497311cbdb6f4eb Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0051000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0052000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0052000.png new file mode 100644 index 0000000000000000000000000000000000000000..eb1bb224532110d5a2ea8ebe43473e9f67beb853 Binary files 
/dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0052000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0053000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0053000.png new file mode 100644 index 0000000000000000000000000000000000000000..7e7c49385ee5f84fcfe824854a3bdf1e789c02bb Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0053000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0054000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0054000.png new file mode 100644 index 0000000000000000000000000000000000000000..90a95fc31cb8edec7de86e8c720ed5b68c8f7b7a Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0054000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0055000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0055000.png new file mode 100644 index 0000000000000000000000000000000000000000..d5e4ea86f80bd2e4183cdec7082597cdef273507 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0055000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0056000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0056000.png new file mode 100644 index 0000000000000000000000000000000000000000..3d13c0403d89c15202214215d6db50ddbeb26576 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0056000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0057000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0057000.png new file mode 100644 index 0000000000000000000000000000000000000000..a8c91d4e4d8838f0aa39f38e0622293c756b506b Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0057000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0058000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0058000.png new file mode 100644 index 0000000000000000000000000000000000000000..5ad088c7e590e3fd37517c99ef9408547fe4f5ff Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0058000.png differ diff --git 
a/V4.3-ckpt/cosine_sim_graphs/graph_step_0059000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0059000.png new file mode 100644 index 0000000000000000000000000000000000000000..993ccc7c355591abaabcbef316553ae26e3ca9bc Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0059000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0060000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0060000.png new file mode 100644 index 0000000000000000000000000000000000000000..9c3fb2dc919b926dce85bcff454fa761c98dab0b Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0060000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0061000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0061000.png new file mode 100644 index 0000000000000000000000000000000000000000..c167ef7a6332615a8f84086595914c12b992df9d Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0061000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0062000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0062000.png new file mode 100644 index 0000000000000000000000000000000000000000..4b0db7546b270ee08a57d01dae08f5df72c833bb Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0062000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0063000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0063000.png new file mode 100644 index 0000000000000000000000000000000000000000..6c97a02c73ebcdaddbab74008aa0e49e50fbd5fa Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0063000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0064000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0064000.png new file mode 100644 index 0000000000000000000000000000000000000000..6906230953162f60cf9c5b2e053554bac8c4c119 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0064000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0065000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0065000.png new file 
mode 100644 index 0000000000000000000000000000000000000000..196fbab31009c6e13de200707f41813840e02e79 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0065000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0066000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0066000.png new file mode 100644 index 0000000000000000000000000000000000000000..9d9c08194ee5b282bdff8f55cd6ba57defb5e46d Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0066000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0067000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0067000.png new file mode 100644 index 0000000000000000000000000000000000000000..d128b3cc1422651d81c33b112675e76f9f87a888 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0067000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0068000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0068000.png new file mode 100644 index 0000000000000000000000000000000000000000..82cd5274c8f77065e06c86688a6ec979a9b9f2ee Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0068000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0069000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0069000.png new file mode 100644 index 0000000000000000000000000000000000000000..b3ba2ee7c93334edc9e177ea89253a9416c9087e Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0069000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0070000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0070000.png new file mode 100644 index 0000000000000000000000000000000000000000..61b1e01dd27b922ee4d9b0797a90677ca49bb583 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0070000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0071000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0071000.png new file mode 100644 index 0000000000000000000000000000000000000000..ee8d82ed815de09135aca44d383aed8e0c6ab0f3 Binary files 
/dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0071000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0072000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0072000.png new file mode 100644 index 0000000000000000000000000000000000000000..27d6865f1afe0e0f042f5fb6be72bb6ece72126c Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0072000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0073000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0073000.png new file mode 100644 index 0000000000000000000000000000000000000000..2a3ed339633e4f960396a92b99542970d1fb00b1 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0073000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0074000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0074000.png new file mode 100644 index 0000000000000000000000000000000000000000..c643b74682c4eba508402f939cd9f71d0601cbfd Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0074000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0075000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0075000.png new file mode 100644 index 0000000000000000000000000000000000000000..0ad6a4573c15ea91ebdec2d352e08ff44aa526c3 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0075000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0076000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0076000.png new file mode 100644 index 0000000000000000000000000000000000000000..7e40041273080f54cd53a572aa462b9f26918f6f Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0076000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0077000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0077000.png new file mode 100644 index 0000000000000000000000000000000000000000..e953e3f4228b6b8dd5f91d47a5a43584a2376526 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0077000.png differ diff --git 
a/V4.3-ckpt/cosine_sim_graphs/graph_step_0078000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0078000.png new file mode 100644 index 0000000000000000000000000000000000000000..036519006006ef25e28d131bd0fd09b64efc1fae Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0078000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0079000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0079000.png new file mode 100644 index 0000000000000000000000000000000000000000..08dcfc33e5146542b4611d933641c83edd681ec2 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0079000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0080000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0080000.png new file mode 100644 index 0000000000000000000000000000000000000000..fdfb3528a22f784cd451ccee47989ebf370aa337 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0080000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0081000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0081000.png new file mode 100644 index 0000000000000000000000000000000000000000..b84f6625998110ecfefe4cec2d691f302cf2c895 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0081000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0082000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0082000.png new file mode 100644 index 0000000000000000000000000000000000000000..8e7eef9734efb37793cb383446d4c11b2e577890 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0082000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0083000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0083000.png new file mode 100644 index 0000000000000000000000000000000000000000..4673b407e09398ea7b03af77af4d266c53c442e8 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0083000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0084000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0084000.png new file 
mode 100644 index 0000000000000000000000000000000000000000..9b8c0f79b237b82852a6abde99ad47e92dfa4675 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0084000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0085000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0085000.png new file mode 100644 index 0000000000000000000000000000000000000000..9d0b9b9d1e5f63ab7659807f4cc6c5b9235e9713 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0085000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0086000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0086000.png new file mode 100644 index 0000000000000000000000000000000000000000..118928d260eee767c1a8503c915fee50508c3df1 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0086000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0087000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0087000.png new file mode 100644 index 0000000000000000000000000000000000000000..c795b274c07238a028c18cf510e48b00007c85a5 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0087000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0088000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0088000.png new file mode 100644 index 0000000000000000000000000000000000000000..2c57b0e15d89ecae6ba9dc3eef0952edb30aead3 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0088000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0089000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0089000.png new file mode 100644 index 0000000000000000000000000000000000000000..a04451ddb9ea261564cd82df0a78ec5200f0eb19 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0089000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0090000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0090000.png new file mode 100644 index 0000000000000000000000000000000000000000..6ae0eafd53eb3456b6e0cafe7630d4104756e135 Binary files 
/dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0090000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0091000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0091000.png new file mode 100644 index 0000000000000000000000000000000000000000..c29397ee9d8513435687142c75c1cea97e92937b Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0091000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0092000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0092000.png new file mode 100644 index 0000000000000000000000000000000000000000..0a5734a4aa49eeeb6ed827bdbead2d1a71c28ec8 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0092000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0093000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0093000.png new file mode 100644 index 0000000000000000000000000000000000000000..6149a91f1d59ec746f35cc2cf5fd48837255ddf1 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0093000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0094000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0094000.png new file mode 100644 index 0000000000000000000000000000000000000000..85415ca2b42b6b7f3e122eb028e8c0ef47280c7d Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0094000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0095000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0095000.png new file mode 100644 index 0000000000000000000000000000000000000000..dba15341c074b623c3cde775c99f041332d8f5ac Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0095000.png differ diff --git a/V4.3-ckpt/cosine_sim_graphs/graph_step_0096000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0096000.png new file mode 100644 index 0000000000000000000000000000000000000000..b2c885efe05e27a9a4e721466cf8daf0bd7c1038 Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0096000.png differ diff --git 
a/V4.3-ckpt/cosine_sim_graphs/graph_step_0097000.png b/V4.3-ckpt/cosine_sim_graphs/graph_step_0097000.png new file mode 100644 index 0000000000000000000000000000000000000000..19cd46ed4ee9d4e1f6155c6600b1be3b99593a3f Binary files /dev/null and b/V4.3-ckpt/cosine_sim_graphs/graph_step_0097000.png differ diff --git a/V4.3-ckpt/deepspeed_config.json b/V4.3-ckpt/deepspeed_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed373ab89d65b4449b7b9ce2c3e0a914a2faebd9 --- /dev/null +++ b/V4.3-ckpt/deepspeed_config.json @@ -0,0 +1,27 @@ +{ + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 100, + "optimizer": { + "type": "Adam", + "adam_w_mode": true, + "params": { + "lr": 1e-05, + "weight_decay": 0.01, + "bias_correction": true, + "betas": [ + 0.9, + 0.98 + ], + "eps": 1e-08 + } + }, + "zero_optimization": { + "stage": 1, + "reduce_bucket_size": 500000000.0 + }, + "bf16": { + "enabled": true + }, + "gradient_clipping": 0.7 +} \ No newline at end of file diff --git a/V4.3-ckpt/train.log b/V4.3-ckpt/train.log new file mode 100644 index 0000000000000000000000000000000000000000..d88728c74e9f1fcf7577b81afd392e3d9b0a37c5 --- /dev/null +++ b/V4.3-ckpt/train.log @@ -0,0 +1,13490 @@ +2025-05-12T08:34:45 | INFO | vindlu : Logging to: scripts/pretraining/clip/B14/B14/train.log +2025-05-12T08:34:45 | INFO | utils.config_utils : config: { + root_path: /home/zli + available_corpus: { + cc3m: { + anno_path: your_path + data_root: + media_type: image } + webvid_10m: { + anno_path: your_path + data_root: + media_type: video } + smol_test: { + anno_path: /root/IV2/InternVideo2/multi_modality/data_test/smol_test.json + data_root: /root/IV2/InternVideo2/multi_modality/data_test/ + media_type: video } + slim_kinetics: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + slim_kinetics_act_val: { + anno_path: 
/home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + VisionEncoders: { + + TextEncoders: { + bert: { + name: bert_base + pretrained: bert-base-uncased + config: configs/config_bert.json + d_model: 768 + fusion_layer: 9 } + bert_large: { + name: bert_large + pretrained: bert-large-uncased + config: configs/config_bert_large.json + d_model: 1024 + fusion_layer: 19 } + med_bert: { + name: med_bert_base + pretrained: bert-base-uncased + config: configs/med_config.json + d_model: 768 } + med_bert_large: { + name: med_bert_large + pretrained: bert-base-uncased + config: configs/med_large_config.json + d_model: 768 } } + train_corpus: slim_kinetics + train_file: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + test_file: { + act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + test_types: ['act_val'] + num_workers: 2 + stop_key: None + num_frames: 8 + num_frames_test: 8 + batch_size: 16 + batch_size_test: 16 + max_txt_l: 32 + size_t: 224 + inputs: { + image_res: 224 + video_input: { + num_frames: 8 + sample_type: all + num_frames_test: 8 + sample_type_test: all + random_aug: False } + max_txt_l: { + image: 32 + video: 32 } + batch_size: { + image: 16 + video: 16 } + batch_size_test: { + image: 16 + video: 16 } } + model: { + model_cls: InternVideo2_CLIP_small + vision_encoder: { + name: internvideo2 + in_chans: 3 + patch_size: 14 + img_size: 224 + qkv_bias: False + drop_path_rate: 0.0 + head_drop_path_rate: 0.0 + embed_dim: 768 + num_heads: 12 + mlp_ratio: 4 + init_values: 0.1 + qk_normalization: True + depth: 12 + use_flash_attn: True + use_fused_rmsnorm: True + use_fused_mlp: True + fused_mlp_heuristic: 1 + drop_cls_token: False + 
attn_pool_num_heads: 16 + clip_embed_dim: 768 + layerscale_no_force_fp32: True + num_frames: 8 + tubelet_size: 1 + sep_pos_embed: False + use_checkpoint: False + checkpoint_num: 0 + align_dim: 512 } + streaming_vision_encoder: { + in_chans: 3 + patch_size: 14 + img_size: 224 + vit_qkv_bias: True + vit_drop_path_rate: 0.05 + student_embed_dim: 384 + student_depth: 4 + student_num_heads: 6 + vit_mlp_ratio: 3.0 + vit_init_values: None + vit_qk_normalization: False + vit_sep_pos_embed: True + vit_norm_layer_type: rmsnorm + rnn_type: lstm + rnn_hidden_size: 1024 + rnn_num_layers: 1 + fc_hidden_layers: [] + teacher_clip_embed_dim: 768 + student_num_frames_processed_by_vit: 1 + student_tubelet_size_for_vit: 1 } + text_encoder: { + name: mobileclip_b } + temp: 0.01 + temp_min: 0.01 + freeze_vision: True + open_vision_clip_projector: False + freeze_text: True + open_text_projection: False + open_text_lora: False + vision_ckpt_path: /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin + load_vision_ckpt_from_internvideo2_stage2: False + text_ckpt_path: /home/zli/IV2/models/mobileclip_blt.pt + extra_ckpt_path: /home/zli/IV2/models/clip/B14/pytorch_model.bin } + criterion: { + loss_weight: { + vtc: 1.0 } } + optimizer: { + opt: adamW + lr: 1e-05 + opt_betas: [0.9, 0.98] + weight_decay: 0.01 + max_grad_norm: 0.7 + different_lr: { + enable: False + module_names: [] + lr: 1e-05 } } + scheduler: { + sched: cosine + epochs: 1 + min_lr_multi: 0.01 + warmup_epochs: 0.05 } + evaluate: False + deep_fusion: False + evaluation: { + eval_frame_ensemble: concat + eval_x_only: False + k_test: 128 + eval_offload: True } + use_half_precision: True + use_bf16: True + gradient_checkpointing: True + wandb: { + enable: True + entity: qingy2019-conker-mobile-inc- + project: window_iv2 } + dist_url: env:// + device: cuda + mode: pt + output_dir: scripts/pretraining/clip/B14/B14 + resume: True + debug: False + log_freq: 1 + seed: 42 + save_latest: False + save_iter: 5000 + 
eval_freq_steps: 1000 + eval_repo_id: qingy2024/backflip_train + filename: 1.mp4 + eval_plot_output_dir: scripts/pretraining/clip/B14/cosine_sim_graphs + auto_resume: True + pretrained_path: + deepspeed: { + enable: True + stage: 1 } + rank: 0 + world_size: 1 + gpu: 0 + distributed: True + dist_backend: nccl + deepspeed_config: scripts/pretraining/clip/B14/B14/deepspeed_config.json } +2025-05-12T08:34:46 | INFO | __main__ : train_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:34:46 | INFO | __main__ : Creating dataset for pt +2025-05-12T08:34:46 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T08:34:46 | INFO | dataset : dataset_type: pt_train media_type: video dataset_cls: +2025-05-12T08:34:46 | INFO | dataset : dataset_type=pt_train, train_file={'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:34:46 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'}, 'transform': Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'num_epochs': 1, 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 10} +2025-05-12T08:34:46 | INFO | dataset : train_transform: +2025-05-12T08:34:46 | INFO | dataset : Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 
0.225)) +) +2025-05-12T08:34:46 | INFO | dataset.pt_dataset : ann_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:34:46 | INFO | dataset.pt_dataset : Loading json file /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json +2025-05-12T08:34:46 | INFO | dataset.pt_dataset : Loading from local file! +2025-05-12T08:34:46 | INFO | dataset.pt_dataset : Num samples: 99243 +2025-05-12T08:34:46 | INFO | dataset.pt_dataset : Num too short: 19586 +2025-05-12T08:34:46 | INFO | dataset.pt_dataset : num_examples: 79657 +2025-05-12T08:34:46 | INFO | dataset : Use ConcatDataset for video +2025-05-12T08:34:46 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T08:34:46 | INFO | dataset : dataset_type: ret_eval media_type: video dataset_cls: +2025-05-12T08:34:46 | INFO | dataset : dataset_type=pt_eval, test_file={'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True} +2025-05-12T08:34:46 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True}, 'transform': Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 1} +2025-05-12T08:34:46 | INFO | dataset : test_transform: +2025-05-12T08:34:46 | INFO | dataset : Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:34:46 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 
+2025-05-12T08:34:46 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:34:46 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:34:46 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:34:46 | INFO | tasks_clip.shared_utils : Creating model +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze cls_token +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze pos_embed +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 
blocks.1.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.weight 
+2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.bias +2025-05-12T08:34:47 | INFO | 
models.internvideo2_clip_small : Freeze blocks.4.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : 
Freeze blocks.5.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm2.weight 
+2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.weight +2025-05-12T08:34:47 | INFO | 
models.internvideo2_clip_small : Freeze blocks.9.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | 
models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.qkv.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.q_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.k_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls1.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls2.gamma +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.bias +2025-05-12T08:34:47 | 
INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q_bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k_bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v_bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : ---- Froze all the vision encoder params ---- +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : ---- Froze all the vision align params ---- +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze projection_layer +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze embedding_layer.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze positional_embedding.pos_embed.pos_embed +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | 
models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 
transformer.1.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 
| INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 
transformer.4.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.weight 
+2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : 
Freeze transformer.7.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze 
transformer.9.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.bias 
+2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.weight +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.bias +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Load vision_encoder checkpoint from /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Load text_encoder checkpoint from /home/zli/IV2/models/mobileclip_blt.pt +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : Load extra checkpoint from /home/zli/IV2/models/clip/B14/pytorch_model.bin +2025-05-12T08:34:47 | INFO | models.internvideo2_clip_small : 
_IncompatibleKeys(missing_keys=['streaming_vision_encoder.vit_lite.cls_token', 'streaming_vision_encoder.vit_lite.pos_embed_spatial', 'streaming_vision_encoder.vit_lite.pos_embed_cls', 'streaming_vision_encoder.vit_lite.patch_embed.proj.weight', 'streaming_vision_encoder.vit_lite.patch_embed.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias', 
'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias', 'streaming_vision_encoder.rnn.weight_ih_l0', 'streaming_vision_encoder.rnn.weight_hh_l0', 'streaming_vision_encoder.rnn.bias_ih_l0', 'streaming_vision_encoder.rnn.bias_hh_l0', 'streaming_vision_encoder.output_fc.0.weight', 'streaming_vision_encoder.output_fc.0.bias', 'streaming_vision_align.0.weight', 'streaming_vision_align.0.bias', 'streaming_vision_align.1.weight', 'streaming_vision_align.1.bias'], unexpected_keys=[]) +2025-05-12T08:34:47 | INFO | tasks_clip.shared_utils : Change to bfloat16 for model +2025-05-12T08:34:47 | INFO | utils.optimizer : diff_names: [], diff_lr: None +2025-05-12T08:34:47 | INFO | utils.optimizer : param temp: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.cls_token: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_spatial: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_cls: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.patch_embed.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_ih_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_hh_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_ih_l0: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_hh_l0: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | 
utils.optimizer : param streaming_vision_encoder.output_fc.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_align.0.weight: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_align.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_align.1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : param streaming_vision_align.1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:34:47 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0 len(p)=32 +2025-05-12T08:34:47 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0.01 len(p)=24 +2025-05-12T08:34:47 | INFO | tasks_clip.shared_utils : Auto resuming +2025-05-12T08:34:47 | INFO | tasks_clip.shared_utils : Not found checkpoint in scripts/pretraining/clip/B14/B14 +2025-05-12T08:34:47 | INFO | tasks_clip.shared_utils : Use deepspeed to initialize model!!! +2025-05-12T08:34:47 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:34:47 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:34:48 | INFO | tasks_clip.shared_utils : Cuda memory after create model: 379M, Max mem: 379M +2025-05-12T08:34:48 | INFO | __main__ : Start training +2025-05-12T08:34:48 | INFO | __main__ : Epoch: 0 +2025-05-12T08:34:48 | WARNING | __main__ : Model does not have a 'transform' or 'config.model.vision_encoder.img_size' attribute. Using default transform. 
+2025-05-12T08:36:40 | INFO | vindlu : Logging to: scripts/pretraining/clip/B14/B14/train.log +2025-05-12T08:36:40 | INFO | utils.config_utils : config: { + root_path: /home/zli + available_corpus: { + cc3m: { + anno_path: your_path + data_root: + media_type: image } + webvid_10m: { + anno_path: your_path + data_root: + media_type: video } + smol_test: { + anno_path: /root/IV2/InternVideo2/multi_modality/data_test/smol_test.json + data_root: /root/IV2/InternVideo2/multi_modality/data_test/ + media_type: video } + slim_kinetics: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + slim_kinetics_act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + VisionEncoders: { + + TextEncoders: { + bert: { + name: bert_base + pretrained: bert-base-uncased + config: configs/config_bert.json + d_model: 768 + fusion_layer: 9 } + bert_large: { + name: bert_large + pretrained: bert-large-uncased + config: configs/config_bert_large.json + d_model: 1024 + fusion_layer: 19 } + med_bert: { + name: med_bert_base + pretrained: bert-base-uncased + config: configs/med_config.json + d_model: 768 } + med_bert_large: { + name: med_bert_large + pretrained: bert-base-uncased + config: configs/med_large_config.json + d_model: 768 } } + train_corpus: slim_kinetics + train_file: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + test_file: { + act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + test_types: ['act_val'] + num_workers: 2 + stop_key: None + num_frames: 8 + num_frames_test: 8 + batch_size: 16 + batch_size_test: 16 + max_txt_l: 32 + size_t: 
224 + inputs: { + image_res: 224 + video_input: { + num_frames: 8 + sample_type: all + num_frames_test: 8 + sample_type_test: all + random_aug: False } + max_txt_l: { + image: 32 + video: 32 } + batch_size: { + image: 16 + video: 16 } + batch_size_test: { + image: 16 + video: 16 } } + model: { + model_cls: InternVideo2_CLIP_small + vision_encoder: { + name: internvideo2 + in_chans: 3 + patch_size: 14 + img_size: 224 + qkv_bias: False + drop_path_rate: 0.0 + head_drop_path_rate: 0.0 + embed_dim: 768 + num_heads: 12 + mlp_ratio: 4 + init_values: 0.1 + qk_normalization: True + depth: 12 + use_flash_attn: True + use_fused_rmsnorm: True + use_fused_mlp: True + fused_mlp_heuristic: 1 + drop_cls_token: False + attn_pool_num_heads: 16 + clip_embed_dim: 768 + layerscale_no_force_fp32: True + num_frames: 8 + tubelet_size: 1 + sep_pos_embed: False + use_checkpoint: False + checkpoint_num: 0 + align_dim: 512 } + streaming_vision_encoder: { + in_chans: 3 + patch_size: 14 + img_size: 224 + vit_qkv_bias: True + vit_drop_path_rate: 0.05 + student_embed_dim: 384 + student_depth: 4 + student_num_heads: 6 + vit_mlp_ratio: 3.0 + vit_init_values: None + vit_qk_normalization: False + vit_sep_pos_embed: True + vit_norm_layer_type: rmsnorm + rnn_type: lstm + rnn_hidden_size: 1024 + rnn_num_layers: 1 + fc_hidden_layers: [] + teacher_clip_embed_dim: 768 + student_num_frames_processed_by_vit: 1 + student_tubelet_size_for_vit: 1 } + text_encoder: { + name: mobileclip_b } + temp: 0.01 + temp_min: 0.01 + freeze_vision: True + open_vision_clip_projector: False + freeze_text: True + open_text_projection: False + open_text_lora: False + vision_ckpt_path: /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin + load_vision_ckpt_from_internvideo2_stage2: False + text_ckpt_path: /home/zli/IV2/models/mobileclip_blt.pt + extra_ckpt_path: /home/zli/IV2/models/clip/B14/pytorch_model.bin } + criterion: { + loss_weight: { + vtc: 1.0 } } + optimizer: { + opt: adamW + lr: 1e-05 + opt_betas: 
[0.9, 0.98] + weight_decay: 0.01 + max_grad_norm: 0.7 + different_lr: { + enable: False + module_names: [] + lr: 1e-05 } } + scheduler: { + sched: cosine + epochs: 1 + min_lr_multi: 0.01 + warmup_epochs: 0.05 } + evaluate: False + deep_fusion: False + evaluation: { + eval_frame_ensemble: concat + eval_x_only: False + k_test: 128 + eval_offload: True } + use_half_precision: True + use_bf16: True + gradient_checkpointing: True + wandb: { + enable: True + entity: qingy2019-conker-mobile-inc- + project: window_iv2 } + dist_url: env:// + device: cuda + mode: pt + output_dir: scripts/pretraining/clip/B14/B14 + resume: True + debug: False + log_freq: 1 + seed: 42 + save_latest: False + save_iter: 5000 + eval_freq_steps: 1000 + eval_repo_id: qingy2024/backflip_train + filename: 1.mp4 + eval_plot_output_dir: scripts/pretraining/clip/B14/cosine_sim_graphs + auto_resume: True + pretrained_path: + deepspeed: { + enable: True + stage: 1 } + rank: 0 + world_size: 1 + gpu: 0 + distributed: True + dist_backend: nccl + deepspeed_config: scripts/pretraining/clip/B14/B14/deepspeed_config.json } +2025-05-12T08:36:41 | INFO | __main__ : train_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:36:41 | INFO | __main__ : Creating dataset for pt +2025-05-12T08:36:41 | WARNING | dataset : Make sure that you don't need audio input!!! 
+2025-05-12T08:36:41 | INFO | dataset : dataset_type: pt_train media_type: video dataset_cls: +2025-05-12T08:36:41 | INFO | dataset : dataset_type=pt_train, train_file={'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:36:41 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'}, 'transform': Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'num_epochs': 1, 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 10} +2025-05-12T08:36:41 | INFO | dataset : train_transform: +2025-05-12T08:36:41 | INFO | dataset : Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:36:41 | INFO | dataset.pt_dataset : ann_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:36:41 | INFO | dataset.pt_dataset : Loading json file /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json +2025-05-12T08:36:41 | INFO | dataset.pt_dataset : Loading from local file! 
+2025-05-12T08:36:41 | INFO | dataset.pt_dataset : Num samples: 99243 +2025-05-12T08:36:41 | INFO | dataset.pt_dataset : Num too short: 19586 +2025-05-12T08:36:41 | INFO | dataset.pt_dataset : num_examples: 79657 +2025-05-12T08:36:41 | INFO | dataset : Use ConcatDataset for video +2025-05-12T08:36:41 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T08:36:41 | INFO | dataset : dataset_type: ret_eval media_type: video dataset_cls: +2025-05-12T08:36:41 | INFO | dataset : dataset_type=pt_eval, test_file={'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True} +2025-05-12T08:36:41 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True}, 'transform': Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 1} +2025-05-12T08:36:41 | INFO | dataset : test_transform: +2025-05-12T08:36:41 | INFO | dataset : Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:36:41 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:36:41 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:36:41 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:36:41 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:36:41 | INFO | tasks_clip.shared_utils : Creating model +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze cls_token 
+2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze pos_embed +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.bias +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm1.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.qkv.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.bias +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.q_norm.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.k_norm.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls1.gamma +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm2.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.bias +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.weight +2025-05-12T08:36:41 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | 
models.internvideo2_clip_small : Freeze blocks.1.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 
blocks.3.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.weight 
+2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.bias +2025-05-12T08:36:42 | INFO | 
models.internvideo2_clip_small : Freeze blocks.6.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : 
Freeze blocks.7.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm2.weight 
+2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.qkv.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.weight +2025-05-12T08:36:42 | 
INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.q_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.k_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls1.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls2.gamma +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q_bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k_bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v_bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 
clip_projector.cross_attn.k.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : ---- Froze all the vision encoder params ---- +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : ---- Froze all the vision align params ---- +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze projection_layer +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze embedding_layer.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze positional_embedding.pos_embed.pos_embed +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | 
models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.bias 
+2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | 
models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze 
transformer.5.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.weight 
+2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : 
Freeze transformer.8.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.bias 
+2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:36:42 | INFO | 
models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.weight +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.bias +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Load vision_encoder checkpoint from /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Load text_encoder checkpoint from /home/zli/IV2/models/mobileclip_blt.pt +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : Load extra checkpoint from /home/zli/IV2/models/clip/B14/pytorch_model.bin +2025-05-12T08:36:42 | INFO | models.internvideo2_clip_small : _IncompatibleKeys(missing_keys=['streaming_vision_encoder.vit_lite.cls_token', 'streaming_vision_encoder.vit_lite.pos_embed_spatial', 'streaming_vision_encoder.vit_lite.pos_embed_cls', 'streaming_vision_encoder.vit_lite.patch_embed.proj.weight', 'streaming_vision_encoder.vit_lite.patch_embed.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight', 
'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias', 
'streaming_vision_encoder.vit_lite.blocks.3.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias', 'streaming_vision_encoder.rnn.weight_ih_l0', 'streaming_vision_encoder.rnn.weight_hh_l0', 'streaming_vision_encoder.rnn.bias_ih_l0', 'streaming_vision_encoder.rnn.bias_hh_l0', 'streaming_vision_encoder.output_fc.0.weight', 'streaming_vision_encoder.output_fc.0.bias', 'streaming_vision_align.0.weight', 'streaming_vision_align.0.bias', 'streaming_vision_align.1.weight', 'streaming_vision_align.1.bias'], unexpected_keys=[]) +2025-05-12T08:36:42 | INFO | tasks_clip.shared_utils : Change to bfloat16 for model +2025-05-12T08:36:42 | INFO | utils.optimizer : diff_names: [], diff_lr: None +2025-05-12T08:36:42 | INFO | utils.optimizer : param temp: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.cls_token: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_spatial: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_cls: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias: wd: 0, lr: 1e-05 
+2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | 
utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_ih_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_hh_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_ih_l0: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_hh_l0: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_align.0.weight: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_align.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_align.1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:36:42 | INFO | utils.optimizer : param streaming_vision_align.1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:36:42 | INFO | 
utils.optimizer : optimizer -- lr=1e-05 wd=0 len(p)=32 +2025-05-12T08:36:42 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0.01 len(p)=24 +2025-05-12T08:36:42 | INFO | tasks_clip.shared_utils : Auto resuming +2025-05-12T08:36:42 | INFO | tasks_clip.shared_utils : Not found checkpoint in scripts/pretraining/clip/B14/B14 +2025-05-12T08:36:42 | INFO | tasks_clip.shared_utils : Use deepspeed to initialize model!!! +2025-05-12T08:36:42 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:36:42 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:36:43 | INFO | tasks_clip.shared_utils : Cuda memory after create model: 379M, Max mem: 379M +2025-05-12T08:36:43 | INFO | __main__ : Start training +2025-05-12T08:36:43 | INFO | __main__ : Epoch: 0 +2025-05-12T08:36:43 | WARNING | __main__ : Model does not have a 'transform' or 'config.model.vision_encoder.img_size' attribute. Using default transform. 
+2025-05-12T08:37:43 | INFO | vindlu : Logging to: scripts/pretraining/clip/B14/B14/train.log +2025-05-12T08:37:43 | INFO | utils.config_utils : config: { + root_path: /home/zli + available_corpus: { + cc3m: { + anno_path: your_path + data_root: + media_type: image } + webvid_10m: { + anno_path: your_path + data_root: + media_type: video } + smol_test: { + anno_path: /root/IV2/InternVideo2/multi_modality/data_test/smol_test.json + data_root: /root/IV2/InternVideo2/multi_modality/data_test/ + media_type: video } + slim_kinetics: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + slim_kinetics_act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + VisionEncoders: { + + TextEncoders: { + bert: { + name: bert_base + pretrained: bert-base-uncased + config: configs/config_bert.json + d_model: 768 + fusion_layer: 9 } + bert_large: { + name: bert_large + pretrained: bert-large-uncased + config: configs/config_bert_large.json + d_model: 1024 + fusion_layer: 19 } + med_bert: { + name: med_bert_base + pretrained: bert-base-uncased + config: configs/med_config.json + d_model: 768 } + med_bert_large: { + name: med_bert_large + pretrained: bert-base-uncased + config: configs/med_large_config.json + d_model: 768 } } + train_corpus: slim_kinetics + train_file: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + test_file: { + act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + test_types: ['act_val'] + num_workers: 2 + stop_key: None + num_frames: 8 + num_frames_test: 8 + batch_size: 16 + batch_size_test: 16 + max_txt_l: 32 + size_t: 
224 + inputs: { + image_res: 224 + video_input: { + num_frames: 8 + sample_type: all + num_frames_test: 8 + sample_type_test: all + random_aug: False } + max_txt_l: { + image: 32 + video: 32 } + batch_size: { + image: 16 + video: 16 } + batch_size_test: { + image: 16 + video: 16 } } + model: { + model_cls: InternVideo2_CLIP_small + vision_encoder: { + name: internvideo2 + in_chans: 3 + patch_size: 14 + img_size: 224 + qkv_bias: False + drop_path_rate: 0.0 + head_drop_path_rate: 0.0 + embed_dim: 768 + num_heads: 12 + mlp_ratio: 4 + init_values: 0.1 + qk_normalization: True + depth: 12 + use_flash_attn: True + use_fused_rmsnorm: True + use_fused_mlp: True + fused_mlp_heuristic: 1 + drop_cls_token: False + attn_pool_num_heads: 16 + clip_embed_dim: 768 + layerscale_no_force_fp32: True + num_frames: 8 + tubelet_size: 1 + sep_pos_embed: False + use_checkpoint: False + checkpoint_num: 0 + align_dim: 512 } + streaming_vision_encoder: { + in_chans: 3 + patch_size: 14 + img_size: 224 + vit_qkv_bias: True + vit_drop_path_rate: 0.05 + student_embed_dim: 384 + student_depth: 4 + student_num_heads: 6 + vit_mlp_ratio: 3.0 + vit_init_values: None + vit_qk_normalization: False + vit_sep_pos_embed: True + vit_norm_layer_type: rmsnorm + rnn_type: lstm + rnn_hidden_size: 1024 + rnn_num_layers: 1 + fc_hidden_layers: [] + teacher_clip_embed_dim: 768 + student_num_frames_processed_by_vit: 1 + student_tubelet_size_for_vit: 1 } + text_encoder: { + name: mobileclip_b } + temp: 0.01 + temp_min: 0.01 + freeze_vision: True + open_vision_clip_projector: False + freeze_text: True + open_text_projection: False + open_text_lora: False + vision_ckpt_path: /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin + load_vision_ckpt_from_internvideo2_stage2: False + text_ckpt_path: /home/zli/IV2/models/mobileclip_blt.pt + extra_ckpt_path: /home/zli/IV2/models/clip/B14/pytorch_model.bin } + criterion: { + loss_weight: { + vtc: 1.0 } } + optimizer: { + opt: adamW + lr: 1e-05 + opt_betas: 
[0.9, 0.98] + weight_decay: 0.01 + max_grad_norm: 0.7 + different_lr: { + enable: False + module_names: [] + lr: 1e-05 } } + scheduler: { + sched: cosine + epochs: 1 + min_lr_multi: 0.01 + warmup_epochs: 0.05 } + evaluate: False + deep_fusion: False + evaluation: { + eval_frame_ensemble: concat + eval_x_only: False + k_test: 128 + eval_offload: True } + use_half_precision: True + use_bf16: True + gradient_checkpointing: True + wandb: { + enable: True + entity: qingy2019-conker-mobile-inc- + project: window_iv2 } + dist_url: env:// + device: cuda + mode: pt + output_dir: scripts/pretraining/clip/B14/B14 + resume: True + debug: False + log_freq: 1 + seed: 42 + save_latest: False + save_iter: 5000 + eval_freq_steps: 1000 + eval_video_repo_id: qingy2024/backflip_train + eval_video_filename: 1.mp4 + eval_plot_output_dir: scripts/pretraining/clip/B14/cosine_sim_graphs + auto_resume: True + pretrained_path: + deepspeed: { + enable: True + stage: 1 } + rank: 0 + world_size: 1 + gpu: 0 + distributed: True + dist_backend: nccl + deepspeed_config: scripts/pretraining/clip/B14/B14/deepspeed_config.json } +2025-05-12T08:37:44 | INFO | __main__ : train_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:37:44 | INFO | __main__ : Creating dataset for pt +2025-05-12T08:37:44 | WARNING | dataset : Make sure that you don't need audio input!!! 
+2025-05-12T08:37:44 | INFO | dataset : dataset_type: pt_train media_type: video dataset_cls: +2025-05-12T08:37:44 | INFO | dataset : dataset_type=pt_train, train_file={'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:37:44 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'}, 'transform': Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'num_epochs': 1, 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 10} +2025-05-12T08:37:44 | INFO | dataset : train_transform: +2025-05-12T08:37:44 | INFO | dataset : Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:37:44 | INFO | dataset.pt_dataset : ann_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:37:44 | INFO | dataset.pt_dataset : Loading json file /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json +2025-05-12T08:37:44 | INFO | dataset.pt_dataset : Loading from local file! 
+2025-05-12T08:37:44 | INFO | dataset.pt_dataset : Num samples: 99243 +2025-05-12T08:37:44 | INFO | dataset.pt_dataset : Num too short: 19586 +2025-05-12T08:37:44 | INFO | dataset.pt_dataset : num_examples: 79657 +2025-05-12T08:37:44 | INFO | dataset : Use ConcatDataset for video +2025-05-12T08:37:44 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T08:37:44 | INFO | dataset : dataset_type: ret_eval media_type: video dataset_cls: +2025-05-12T08:37:44 | INFO | dataset : dataset_type=pt_eval, test_file={'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True} +2025-05-12T08:37:44 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True}, 'transform': Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 1} +2025-05-12T08:37:44 | INFO | dataset : test_transform: +2025-05-12T08:37:44 | INFO | dataset : Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:37:44 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:37:44 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:37:44 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:37:44 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:37:44 | INFO | tasks_clip.shared_utils : Creating model +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze cls_token 
+2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze pos_embed +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | 
models.internvideo2_clip_small : Freeze blocks.1.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 
blocks.3.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.weight 
+2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.bias +2025-05-12T08:37:45 | INFO | 
models.internvideo2_clip_small : Freeze blocks.6.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : 
Freeze blocks.7.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm2.weight 
+2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.qkv.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.weight +2025-05-12T08:37:45 | 
INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.q_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.k_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls1.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls2.gamma +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q_bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k_bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v_bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 
clip_projector.cross_attn.k.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : ---- Froze all the vision encoder params ---- +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : ---- Froze all the vision align params ---- +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze projection_layer +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze embedding_layer.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze positional_embedding.pos_embed.pos_embed +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | 
models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.bias 
+2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | 
models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze 
transformer.5.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.weight 
+2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : 
Freeze transformer.8.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.bias 
+2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:37:45 | INFO | 
models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.weight +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.bias +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Load vision_encoder checkpoint from /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Load text_encoder checkpoint from /home/zli/IV2/models/mobileclip_blt.pt +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : Load extra checkpoint from /home/zli/IV2/models/clip/B14/pytorch_model.bin +2025-05-12T08:37:45 | INFO | models.internvideo2_clip_small : _IncompatibleKeys(missing_keys=['streaming_vision_encoder.vit_lite.cls_token', 'streaming_vision_encoder.vit_lite.pos_embed_spatial', 'streaming_vision_encoder.vit_lite.pos_embed_cls', 'streaming_vision_encoder.vit_lite.patch_embed.proj.weight', 'streaming_vision_encoder.vit_lite.patch_embed.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight', 
'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias', 
'streaming_vision_encoder.vit_lite.blocks.3.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias', 'streaming_vision_encoder.rnn.weight_ih_l0', 'streaming_vision_encoder.rnn.weight_hh_l0', 'streaming_vision_encoder.rnn.bias_ih_l0', 'streaming_vision_encoder.rnn.bias_hh_l0', 'streaming_vision_encoder.output_fc.0.weight', 'streaming_vision_encoder.output_fc.0.bias', 'streaming_vision_align.0.weight', 'streaming_vision_align.0.bias', 'streaming_vision_align.1.weight', 'streaming_vision_align.1.bias'], unexpected_keys=[]) +2025-05-12T08:37:45 | INFO | tasks_clip.shared_utils : Change to bfloat16 for model +2025-05-12T08:37:45 | INFO | utils.optimizer : diff_names: [], diff_lr: None +2025-05-12T08:37:45 | INFO | utils.optimizer : param temp: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.cls_token: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_spatial: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_cls: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias: wd: 0, lr: 1e-05 
+2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | 
utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_ih_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_hh_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_ih_l0: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_hh_l0: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_align.0.weight: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_align.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_align.1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:37:45 | INFO | utils.optimizer : param streaming_vision_align.1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:37:45 | INFO | 
utils.optimizer : optimizer -- lr=1e-05 wd=0 len(p)=32 +2025-05-12T08:37:45 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0.01 len(p)=24 +2025-05-12T08:37:45 | INFO | tasks_clip.shared_utils : Auto resuming +2025-05-12T08:37:45 | INFO | tasks_clip.shared_utils : Not found checkpoint in scripts/pretraining/clip/B14/B14 +2025-05-12T08:37:45 | INFO | tasks_clip.shared_utils : Use deepspeed to initialize model!!! +2025-05-12T08:37:45 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:37:45 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:37:46 | INFO | tasks_clip.shared_utils : Cuda memory after create model: 379M, Max mem: 379M +2025-05-12T08:37:46 | INFO | __main__ : Start training +2025-05-12T08:37:46 | INFO | __main__ : Epoch: 0 +2025-05-12T08:37:46 | WARNING | __main__ : Model does not have a 'transform' or 'config.model.vision_encoder.img_size' attribute. Using default transform. +2025-05-12T08:37:46 | INFO | __main__ : Getting evaluation video from qingy2024/backflip_train (1.mp4) +2025-05-12T08:37:46 | WARNING | huggingface_hub.file_download : Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet` +2025-05-12T08:37:47 | INFO | dataset.dataloader : Do not skip steps for any dataloader! 
+2025-05-12T08:37:47 | INFO | dataset.dataloader : MetaLoader has 1 dataloaders, 4978 batches in total +dataloader index=0 name=video, batch-size=16 length(#batches)=4978 +2025-05-12T08:38:05 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T08:38:05 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T08:38:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:38:30 | INFO | __main__ : Step: 100 +2025-05-12T08:38:30 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T08:38:30 | INFO | __main__ : Batch-wise Cosine Similarity | -2.53% +2025-05-12T08:38:30 | INFO | __main__ : Cosine Embedding Loss | 1.0253 +2025-05-12T08:38:30 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:38:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:38:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:38:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:38:54 | INFO | __main__ : Step: 200 +2025-05-12T08:38:54 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T08:38:54 | INFO | __main__ : Batch-wise Cosine Similarity | -1.13% +2025-05-12T08:38:54 | INFO | __main__ : Cosine Embedding Loss | 1.0113 +2025-05-12T08:38:54 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:38:54 | INFO | __main__ : Temperature | 
0.0126 +2025-05-12T08:38:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:39:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 0/4978] eta: 4 days, 10:06:57 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 1.0124 eval_avg_sim: No data video-cosine_similarity: -0.0124 time: 76.7411 data: 17.6682 max mem: 11173 res mem: 12166 +2025-05-12T08:39:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:39:18 | INFO | __main__ : Step: 300 +2025-05-12T08:39:18 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T08:39:18 | INFO | __main__ : Batch-wise Cosine Similarity | 0.53% +2025-05-12T08:39:18 | INFO | __main__ : Cosine Embedding Loss | 0.9947 +2025-05-12T08:39:18 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:39:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:39:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:39:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:39:42 | INFO | __main__ : Step: 400 +2025-05-12T08:39:42 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T08:39:42 | INFO | __main__ : Batch-wise Cosine Similarity | 3.91% +2025-05-12T08:39:42 | INFO | __main__ : Cosine Embedding Loss | 0.9609 +2025-05-12T08:39:42 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:39:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:39:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:40:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 1/4978] eta: 3 days, 21:26:13 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.9477 eval_avg_sim: No data video-cosine_similarity: 0.0523 time: 67.5856 data: 8.8342 max mem: 11173 res mem: 15242 
+2025-05-12T08:40:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:40:07 | INFO | __main__ : Step: 500 +2025-05-12T08:40:07 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T08:40:07 | INFO | __main__ : Batch-wise Cosine Similarity | 5.06% +2025-05-12T08:40:07 | INFO | __main__ : Cosine Embedding Loss | 0.9494 +2025-05-12T08:40:07 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:40:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:40:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:40:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:40:31 | INFO | __main__ : Step: 600 +2025-05-12T08:40:31 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T08:40:31 | INFO | __main__ : Batch-wise Cosine Similarity | 8.26% +2025-05-12T08:40:31 | INFO | __main__ : Cosine Embedding Loss | 0.9174 +2025-05-12T08:40:31 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:40:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:40:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:40:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:40:55 | INFO | __main__ : Step: 700 +2025-05-12T08:40:55 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T08:40:55 | INFO | __main__ : Batch-wise Cosine Similarity | 19.21% +2025-05-12T08:40:55 | INFO | __main__ : Cosine Embedding Loss | 0.8079 +2025-05-12T08:40:55 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:40:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:40:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── 
+2025-05-12T08:41:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 2/4978] eta: 3 days, 17:15:20 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.8063 eval_avg_sim: No data video-cosine_similarity: 0.1937 time: 64.5741 data: 5.8900 max mem: 11173 res mem: 15242 +2025-05-12T08:41:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:41:20 | INFO | __main__ : Step: 800 +2025-05-12T08:41:20 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T08:41:20 | INFO | __main__ : Batch-wise Cosine Similarity | 17.65% +2025-05-12T08:41:20 | INFO | __main__ : Cosine Embedding Loss | 0.8235 +2025-05-12T08:41:20 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:41:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:41:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:41:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:41:44 | INFO | __main__ : Step: 900 +2025-05-12T08:41:44 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T08:41:44 | INFO | __main__ : Batch-wise Cosine Similarity | 22.68% +2025-05-12T08:41:44 | INFO | __main__ : Cosine Embedding Loss | 0.7732 +2025-05-12T08:41:44 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:41:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:41:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:41:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 3/4978] eta: 3 days, 14:58:51 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.7545 eval_avg_sim: No data video-cosine_similarity: 0.2455 time: 62.9410 data: 4.4175 max mem: 11173 res mem: 15242 +2025-05-12T08:42:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── 
+2025-05-12T08:42:08 | INFO | __main__ : Step: 1000 +2025-05-12T08:42:08 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T08:42:08 | INFO | __main__ : Batch-wise Cosine Similarity | 25.19% +2025-05-12T08:42:08 | INFO | __main__ : Cosine Embedding Loss | 0.7481 +2025-05-12T08:42:08 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:42:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:45:51 | INFO | vindlu : Logging to: scripts/pretraining/clip/B14/B14/train.log +2025-05-12T08:45:51 | INFO | utils.config_utils : config: { + root_path: /home/zli + available_corpus: { + cc3m: { + anno_path: your_path + data_root: + media_type: image } + webvid_10m: { + anno_path: your_path + data_root: + media_type: video } + smol_test: { + anno_path: /root/IV2/InternVideo2/multi_modality/data_test/smol_test.json + data_root: /root/IV2/InternVideo2/multi_modality/data_test/ + media_type: video } + slim_kinetics: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + slim_kinetics_act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + VisionEncoders: { + + TextEncoders: { + bert: { + name: bert_base + pretrained: bert-base-uncased + config: configs/config_bert.json + d_model: 768 + fusion_layer: 9 } + bert_large: { + name: bert_large + pretrained: bert-large-uncased + config: configs/config_bert_large.json + d_model: 1024 + fusion_layer: 19 } + med_bert: { + name: med_bert_base + pretrained: bert-base-uncased + config: configs/med_config.json + d_model: 768 } + med_bert_large: { + name: med_bert_large + pretrained: bert-base-uncased + config: configs/med_large_config.json + d_model: 768 } } + train_corpus: slim_kinetics + train_file: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: 
/home/zli/kinetics-dataset/k600/train/train + media_type: video } + test_file: { + act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + test_types: ['act_val'] + num_workers: 2 + stop_key: None + num_frames: 8 + num_frames_test: 8 + batch_size: 16 + batch_size_test: 16 + max_txt_l: 32 + size_t: 224 + inputs: { + image_res: 224 + video_input: { + num_frames: 8 + sample_type: all + num_frames_test: 8 + sample_type_test: all + random_aug: False } + max_txt_l: { + image: 32 + video: 32 } + batch_size: { + image: 16 + video: 16 } + batch_size_test: { + image: 16 + video: 16 } } + model: { + model_cls: InternVideo2_CLIP_small + vision_encoder: { + name: internvideo2 + in_chans: 3 + patch_size: 14 + img_size: 224 + qkv_bias: False + drop_path_rate: 0.0 + head_drop_path_rate: 0.0 + embed_dim: 768 + num_heads: 12 + mlp_ratio: 4 + init_values: 0.1 + qk_normalization: True + depth: 12 + use_flash_attn: True + use_fused_rmsnorm: True + use_fused_mlp: True + fused_mlp_heuristic: 1 + drop_cls_token: False + attn_pool_num_heads: 16 + clip_embed_dim: 768 + layerscale_no_force_fp32: True + num_frames: 8 + tubelet_size: 1 + sep_pos_embed: False + use_checkpoint: False + checkpoint_num: 0 + align_dim: 512 } + streaming_vision_encoder: { + in_chans: 3 + patch_size: 14 + img_size: 224 + vit_qkv_bias: True + vit_drop_path_rate: 0.05 + student_embed_dim: 384 + student_depth: 4 + student_num_heads: 6 + vit_mlp_ratio: 3.0 + vit_init_values: None + vit_qk_normalization: False + vit_sep_pos_embed: True + vit_norm_layer_type: rmsnorm + rnn_type: lstm + rnn_hidden_size: 1024 + rnn_num_layers: 1 + fc_hidden_layers: [] + teacher_clip_embed_dim: 768 + student_num_frames_processed_by_vit: 1 + student_tubelet_size_for_vit: 1 } + text_encoder: { + name: mobileclip_b } + temp: 0.01 + temp_min: 0.01 + freeze_vision: True + open_vision_clip_projector: False + freeze_text: True + 
open_text_projection: False + open_text_lora: False + vision_ckpt_path: /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin + load_vision_ckpt_from_internvideo2_stage2: False + text_ckpt_path: /home/zli/IV2/models/mobileclip_blt.pt + extra_ckpt_path: /home/zli/IV2/models/clip/B14/pytorch_model.bin } + criterion: { + loss_weight: { + vtc: 1.0 } } + optimizer: { + opt: adamW + lr: 1e-05 + opt_betas: [0.9, 0.98] + weight_decay: 0.01 + max_grad_norm: 0.7 + different_lr: { + enable: False + module_names: [] + lr: 1e-05 } } + scheduler: { + sched: cosine + epochs: 1 + min_lr_multi: 0.01 + warmup_epochs: 0.05 } + evaluate: False + deep_fusion: False + evaluation: { + eval_frame_ensemble: concat + eval_x_only: False + k_test: 128 + eval_offload: True } + use_half_precision: True + use_bf16: True + gradient_checkpointing: True + wandb: { + enable: True + entity: qingy2019-conker-mobile-inc- + project: window_iv2 } + dist_url: env:// + device: cuda + mode: pt + output_dir: scripts/pretraining/clip/B14/B14 + resume: True + debug: False + log_freq: 1 + seed: 42 + save_latest: False + save_iter: 5000 + eval_freq_steps: 1000 + eval_video_repo_id: qingy2024/backflip_train + eval_video_filename: 1.mp4 + eval_plot_output_dir: scripts/pretraining/clip/B14/cosine_sim_graphs + auto_resume: True + pretrained_path: + deepspeed: { + enable: True + stage: 1 } + rank: 0 + world_size: 1 + gpu: 0 + distributed: True + dist_backend: nccl + deepspeed_config: scripts/pretraining/clip/B14/B14/deepspeed_config.json } +2025-05-12T08:45:52 | INFO | __main__ : train_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:45:52 | INFO | __main__ : Creating dataset for pt +2025-05-12T08:45:52 | WARNING | dataset : Make sure that you don't need audio input!!! 
+2025-05-12T08:45:52 | INFO | dataset : dataset_type: pt_train media_type: video dataset_cls: +2025-05-12T08:45:52 | INFO | dataset : dataset_type=pt_train, train_file={'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:45:52 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'}, 'transform': Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'num_epochs': 1, 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 10} +2025-05-12T08:45:52 | INFO | dataset : train_transform: +2025-05-12T08:45:52 | INFO | dataset : Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:45:52 | INFO | dataset.pt_dataset : ann_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T08:45:52 | INFO | dataset.pt_dataset : Loading json file /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json +2025-05-12T08:45:52 | INFO | dataset.pt_dataset : Loading from local file! 
+2025-05-12T08:45:52 | INFO | dataset.pt_dataset : Num samples: 99243 +2025-05-12T08:45:52 | INFO | dataset.pt_dataset : Num too short: 19586 +2025-05-12T08:45:52 | INFO | dataset.pt_dataset : num_examples: 79657 +2025-05-12T08:45:52 | INFO | dataset : Use ConcatDataset for video +2025-05-12T08:45:52 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T08:45:52 | INFO | dataset : dataset_type: ret_eval media_type: video dataset_cls: +2025-05-12T08:45:52 | INFO | dataset : dataset_type=pt_eval, test_file={'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True} +2025-05-12T08:45:52 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True}, 'transform': Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 1} +2025-05-12T08:45:52 | INFO | dataset : test_transform: +2025-05-12T08:45:52 | INFO | dataset : Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T08:45:52 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:45:52 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:45:52 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T08:45:52 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T08:45:52 | INFO | tasks_clip.shared_utils : Creating model +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze cls_token 
+2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze pos_embed +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | 
models.internvideo2_clip_small : Freeze blocks.1.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 
blocks.3.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.weight 
+2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.bias +2025-05-12T08:45:53 | INFO | 
models.internvideo2_clip_small : Freeze blocks.6.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : 
Freeze blocks.7.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm2.weight 
+2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.qkv.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.weight +2025-05-12T08:45:53 | 
INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.q_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.k_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls1.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls2.gamma +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q_bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k_bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v_bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 
clip_projector.cross_attn.k.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : ---- Froze all the vision encoder params ---- +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : ---- Froze all the vision align params ---- +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze projection_layer +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze embedding_layer.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze positional_embedding.pos_embed.pos_embed +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | 
models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.bias 
+2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | 
models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze 
transformer.5.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.weight 
+2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : 
Freeze transformer.8.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.bias 
+2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.bias +2025-05-12T08:45:53 | INFO | 
models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.weight +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.bias +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Load vision_encoder checkpoint from /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Load text_encoder checkpoint from /home/zli/IV2/models/mobileclip_blt.pt +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : Load extra checkpoint from /home/zli/IV2/models/clip/B14/pytorch_model.bin +2025-05-12T08:45:53 | INFO | models.internvideo2_clip_small : _IncompatibleKeys(missing_keys=['streaming_vision_encoder.vit_lite.cls_token', 'streaming_vision_encoder.vit_lite.pos_embed_spatial', 'streaming_vision_encoder.vit_lite.pos_embed_cls', 'streaming_vision_encoder.vit_lite.patch_embed.proj.weight', 'streaming_vision_encoder.vit_lite.patch_embed.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight', 
'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias', 
'streaming_vision_encoder.vit_lite.blocks.3.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias', 'streaming_vision_encoder.rnn.weight_ih_l0', 'streaming_vision_encoder.rnn.weight_hh_l0', 'streaming_vision_encoder.rnn.bias_ih_l0', 'streaming_vision_encoder.rnn.bias_hh_l0', 'streaming_vision_encoder.output_fc.0.weight', 'streaming_vision_encoder.output_fc.0.bias', 'streaming_vision_align.0.weight', 'streaming_vision_align.0.bias', 'streaming_vision_align.1.weight', 'streaming_vision_align.1.bias'], unexpected_keys=[]) +2025-05-12T08:45:53 | INFO | tasks_clip.shared_utils : Change to bfloat16 for model +2025-05-12T08:45:53 | INFO | utils.optimizer : diff_names: [], diff_lr: None +2025-05-12T08:45:53 | INFO | utils.optimizer : param temp: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.cls_token: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_spatial: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_cls: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias: wd: 0, lr: 1e-05 
+2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | 
utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_ih_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_hh_l0: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_ih_l0: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_hh_l0: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_align.0.weight: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_align.0.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_align.1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T08:45:53 | INFO | utils.optimizer : param streaming_vision_align.1.bias: wd: 0, lr: 1e-05 +2025-05-12T08:45:53 | INFO | 
utils.optimizer : optimizer -- lr=1e-05 wd=0 len(p)=32 +2025-05-12T08:45:53 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0.01 len(p)=24 +2025-05-12T08:45:53 | INFO | tasks_clip.shared_utils : Auto resuming +2025-05-12T08:45:53 | INFO | tasks_clip.shared_utils : Not found checkpoint in scripts/pretraining/clip/B14/B14 +2025-05-12T08:45:53 | INFO | tasks_clip.shared_utils : Use deepspeed to initialize model!!! +2025-05-12T08:45:53 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:45:53 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T08:45:54 | INFO | tasks_clip.shared_utils : Cuda memory after create model: 379M, Max mem: 379M +2025-05-12T08:45:54 | INFO | __main__ : Start training +2025-05-12T08:45:54 | INFO | __main__ : Epoch: 0 +2025-05-12T08:45:54 | WARNING | __main__ : Model does not have a 'transform' or 'config.model.vision_encoder.img_size' attribute. Using default transform. +2025-05-12T08:45:54 | INFO | __main__ : Getting evaluation video from qingy2024/backflip_train (1.mp4) +2025-05-12T08:46:04 | INFO | dataset.dataloader : Do not skip steps for any dataloader! +2025-05-12T08:46:04 | INFO | dataset.dataloader : MetaLoader has 1 dataloaders, 4978 batches in total +dataloader index=0 name=video, batch-size=16 length(#batches)=4978 +2025-05-12T08:46:22 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T08:46:22 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T08:46:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:46:47 | INFO | __main__ : Step: 100 +2025-05-12T08:46:47 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T08:46:47 | INFO | __main__ : Batch-wise Cosine Similarity | -2.53% +2025-05-12T08:46:47 | INFO | __main__ : Cosine Embedding Loss | 1.0253 +2025-05-12T08:46:47 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:46:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:46:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:47:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:47:12 | INFO | __main__ : Step: 200 +2025-05-12T08:47:12 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T08:47:12 | INFO | __main__ : Batch-wise Cosine Similarity | -1.13% +2025-05-12T08:47:12 | INFO | __main__ : Cosine Embedding Loss | 1.0113 +2025-05-12T08:47:12 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:47:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:47:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:47:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 0/4978] eta: 4 days, 11:08:47 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 1.0124 eval_avg_sim: No data video-cosine_similarity: -0.0124 time: 77.4864 
data: 17.6354 max mem: 11173 res mem: 12166 +2025-05-12T08:47:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:47:36 | INFO | __main__ : Step: 300 +2025-05-12T08:47:36 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T08:47:36 | INFO | __main__ : Batch-wise Cosine Similarity | 0.53% +2025-05-12T08:47:36 | INFO | __main__ : Cosine Embedding Loss | 0.9947 +2025-05-12T08:47:36 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:47:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:47:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:48:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:48:01 | INFO | __main__ : Step: 400 +2025-05-12T08:48:01 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T08:48:01 | INFO | __main__ : Batch-wise Cosine Similarity | 3.91% +2025-05-12T08:48:01 | INFO | __main__ : Cosine Embedding Loss | 0.9609 +2025-05-12T08:48:01 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:48:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:48:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:48:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 1/4978] eta: 3 days, 22:14:38 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.9477 eval_avg_sim: No data video-cosine_similarity: 0.0523 time: 68.1693 data: 8.8184 max mem: 11173 res mem: 15242 +2025-05-12T08:48:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:48:25 | INFO | __main__ : Step: 500 +2025-05-12T08:48:25 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T08:48:25 | INFO | __main__ : Batch-wise Cosine Similarity | 5.06% +2025-05-12T08:48:25 | 
INFO | __main__ : Cosine Embedding Loss | 0.9494 +2025-05-12T08:48:25 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:48:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:48:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:48:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:48:50 | INFO | __main__ : Step: 600 +2025-05-12T08:48:50 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T08:48:50 | INFO | __main__ : Batch-wise Cosine Similarity | 8.26% +2025-05-12T08:48:50 | INFO | __main__ : Cosine Embedding Loss | 0.9174 +2025-05-12T08:48:50 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:48:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:48:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:49:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:49:14 | INFO | __main__ : Step: 700 +2025-05-12T08:49:14 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T08:49:14 | INFO | __main__ : Batch-wise Cosine Similarity | 19.21% +2025-05-12T08:49:14 | INFO | __main__ : Cosine Embedding Loss | 0.8079 +2025-05-12T08:49:14 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:49:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:49:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:49:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 2/4978] eta: 3 days, 17:43:40 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.8063 eval_avg_sim: No data video-cosine_similarity: 0.1937 time: 64.9156 data: 5.8793 max mem: 11173 res mem: 15242 +2025-05-12T08:49:38 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:49:38 | INFO | __main__ : Step: 800 +2025-05-12T08:49:38 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T08:49:38 | INFO | __main__ : Batch-wise Cosine Similarity | 17.65% +2025-05-12T08:49:38 | INFO | __main__ : Cosine Embedding Loss | 0.8235 +2025-05-12T08:49:38 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:49:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:49:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:50:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:50:02 | INFO | __main__ : Step: 900 +2025-05-12T08:50:02 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T08:50:02 | INFO | __main__ : Batch-wise Cosine Similarity | 22.68% +2025-05-12T08:50:02 | INFO | __main__ : Cosine Embedding Loss | 0.7732 +2025-05-12T08:50:02 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T08:50:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T08:50:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:50:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 3/4978] eta: 3 days, 15:21:19 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.7545 eval_avg_sim: No data video-cosine_similarity: 0.2455 time: 63.2120 data: 4.4095 max mem: 11173 res mem: 15242 +2025-05-12T08:50:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T08:50:26 | INFO | __main__ : Step: 1000 +2025-05-12T08:50:26 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T08:50:26 | INFO | __main__ : Batch-wise Cosine Similarity | 25.19% +2025-05-12T08:50:26 | INFO | __main__ : Cosine Embedding Loss | 0.7481 +2025-05-12T08:50:26 | INFO | 
__main__ : Learning Rate | 0.000000 +2025-05-12T08:50:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:03:05 | INFO | vindlu : Logging to: scripts/pretraining/clip/B14/B14/train.log +2025-05-12T09:03:05 | INFO | utils.config_utils : config: { + root_path: /home/zli + available_corpus: { + cc3m: { + anno_path: your_path + data_root: + media_type: image } + webvid_10m: { + anno_path: your_path + data_root: + media_type: video } + smol_test: { + anno_path: /root/IV2/InternVideo2/multi_modality/data_test/smol_test.json + data_root: /root/IV2/InternVideo2/multi_modality/data_test/ + media_type: video } + slim_kinetics: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + slim_kinetics_act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + VisionEncoders: { + + TextEncoders: { + bert: { + name: bert_base + pretrained: bert-base-uncased + config: configs/config_bert.json + d_model: 768 + fusion_layer: 9 } + bert_large: { + name: bert_large + pretrained: bert-large-uncased + config: configs/config_bert_large.json + d_model: 1024 + fusion_layer: 19 } + med_bert: { + name: med_bert_base + pretrained: bert-base-uncased + config: configs/med_config.json + d_model: 768 } + med_bert_large: { + name: med_bert_large + pretrained: bert-base-uncased + config: configs/med_large_config.json + d_model: 768 } } + train_corpus: slim_kinetics + train_file: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + test_file: { + act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + test_types: ['act_val'] + num_workers: 2 + stop_key: None + 
num_frames: 8 + num_frames_test: 8 + batch_size: 16 + batch_size_test: 16 + max_txt_l: 32 + size_t: 224 + inputs: { + image_res: 224 + video_input: { + num_frames: 8 + sample_type: all + num_frames_test: 8 + sample_type_test: all + random_aug: False } + max_txt_l: { + image: 32 + video: 32 } + batch_size: { + image: 16 + video: 16 } + batch_size_test: { + image: 16 + video: 16 } } + model: { + model_cls: InternVideo2_CLIP_small + vision_encoder: { + name: internvideo2 + in_chans: 3 + patch_size: 14 + img_size: 224 + qkv_bias: False + drop_path_rate: 0.0 + head_drop_path_rate: 0.0 + embed_dim: 768 + num_heads: 12 + mlp_ratio: 4 + init_values: 0.1 + qk_normalization: True + depth: 12 + use_flash_attn: True + use_fused_rmsnorm: True + use_fused_mlp: True + fused_mlp_heuristic: 1 + drop_cls_token: False + attn_pool_num_heads: 16 + clip_embed_dim: 768 + layerscale_no_force_fp32: True + num_frames: 8 + tubelet_size: 1 + sep_pos_embed: False + use_checkpoint: False + checkpoint_num: 0 + align_dim: 512 } + streaming_vision_encoder: { + in_chans: 3 + patch_size: 14 + img_size: 224 + vit_qkv_bias: True + vit_drop_path_rate: 0.05 + student_embed_dim: 384 + student_depth: 4 + student_num_heads: 6 + vit_mlp_ratio: 3.0 + vit_init_values: None + vit_qk_normalization: False + vit_sep_pos_embed: True + vit_norm_layer_type: rmsnorm + rnn_type: lstm + rnn_hidden_size: 1024 + rnn_num_layers: 1 + fc_hidden_layers: [] + teacher_clip_embed_dim: 768 + student_num_frames_processed_by_vit: 1 + student_tubelet_size_for_vit: 1 } + text_encoder: { + name: mobileclip_b } + temp: 0.01 + temp_min: 0.01 + freeze_vision: True + open_vision_clip_projector: False + freeze_text: True + open_text_projection: False + open_text_lora: False + vision_ckpt_path: /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin + load_vision_ckpt_from_internvideo2_stage2: False + text_ckpt_path: /home/zli/IV2/models/mobileclip_blt.pt + extra_ckpt_path: /home/zli/IV2/models/clip/B14/pytorch_model.bin } + 
criterion: { + loss_weight: { + vtc: 1.0 } } + optimizer: { + opt: adamW + lr: 1e-05 + opt_betas: [0.9, 0.98] + weight_decay: 0.01 + max_grad_norm: 0.7 + different_lr: { + enable: False + module_names: [] + lr: 1e-05 } } + scheduler: { + sched: cosine + epochs: 1 + min_lr_multi: 0.01 + warmup_epochs: 0.05 } + evaluate: False + deep_fusion: False + evaluation: { + eval_frame_ensemble: concat + eval_x_only: False + k_test: 128 + eval_offload: True } + use_half_precision: True + use_bf16: True + gradient_checkpointing: True + wandb: { + enable: True + entity: qingy2019-conker-mobile-inc- + project: window_iv2 } + dist_url: env:// + device: cuda + mode: pt + output_dir: scripts/pretraining/clip/B14/B14 + resume: True + debug: False + log_freq: 1 + seed: 42 + save_latest: False + save_iter: 5000 + eval_freq_steps: 1000 + eval_video_repo_id: qingy2024/backflip_train + eval_video_filename: 1.mp4 + eval_plot_output_dir: scripts/pretraining/clip/B14/cosine_sim_graphs + auto_resume: True + pretrained_path: + deepspeed: { + enable: True + stage: 1 } + rank: 0 + world_size: 1 + gpu: 0 + distributed: True + dist_backend: nccl + deepspeed_config: scripts/pretraining/clip/B14/B14/deepspeed_config.json } +2025-05-12T09:03:06 | INFO | __main__ : train_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T09:03:06 | INFO | __main__ : Creating dataset for pt +2025-05-12T09:03:06 | WARNING | dataset : Make sure that you don't need audio input!!! 
+2025-05-12T09:03:06 | INFO | dataset : dataset_type: pt_train media_type: video dataset_cls: +2025-05-12T09:03:06 | INFO | dataset : dataset_type=pt_train, train_file={'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T09:03:06 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'}, 'transform': Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'num_epochs': 1, 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 10} +2025-05-12T09:03:06 | INFO | dataset : train_transform: +2025-05-12T09:03:06 | INFO | dataset : Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T09:03:06 | INFO | dataset.pt_dataset : ann_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T09:03:06 | INFO | dataset.pt_dataset : Loading json file /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json +2025-05-12T09:03:06 | INFO | dataset.pt_dataset : Loading from local file! 
+2025-05-12T09:03:06 | INFO | dataset.pt_dataset : Num samples: 99243 +2025-05-12T09:03:06 | INFO | dataset.pt_dataset : Num too short: 19586 +2025-05-12T09:03:06 | INFO | dataset.pt_dataset : num_examples: 79657 +2025-05-12T09:03:06 | INFO | dataset : Use ConcatDataset for video +2025-05-12T09:03:06 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T09:03:06 | INFO | dataset : dataset_type: ret_eval media_type: video dataset_cls: +2025-05-12T09:03:06 | INFO | dataset : dataset_type=pt_eval, test_file={'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True} +2025-05-12T09:03:06 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True}, 'transform': Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 1} +2025-05-12T09:03:06 | INFO | dataset : test_transform: +2025-05-12T09:03:06 | INFO | dataset : Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T09:03:06 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T09:03:06 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T09:03:06 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T09:03:06 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T09:03:06 | INFO | tasks_clip.shared_utils : Creating model +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze cls_token 
+2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze pos_embed +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | 
models.internvideo2_clip_small : Freeze blocks.1.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 
blocks.3.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.weight 
+2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.bias +2025-05-12T09:03:07 | INFO | 
models.internvideo2_clip_small : Freeze blocks.6.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : 
Freeze blocks.7.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm2.weight 
+2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.qkv.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.weight +2025-05-12T09:03:07 | 
INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.q_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.k_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls1.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls2.gamma +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q_bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k_bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v_bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 
clip_projector.cross_attn.k.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : ---- Froze all the vision encoder params ---- +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : ---- Froze all the vision align params ---- +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze projection_layer +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze embedding_layer.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze positional_embedding.pos_embed.pos_embed +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | 
models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.bias 
+2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | 
models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze 
transformer.5.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.weight 
+2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : 
Freeze transformer.8.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.bias 
+2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:03:07 | INFO | 
models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.weight +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.bias +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Load vision_encoder checkpoint from /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Load text_encoder checkpoint from /home/zli/IV2/models/mobileclip_blt.pt +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : Load extra checkpoint from /home/zli/IV2/models/clip/B14/pytorch_model.bin +2025-05-12T09:03:07 | INFO | models.internvideo2_clip_small : _IncompatibleKeys(missing_keys=['streaming_vision_encoder.vit_lite.cls_token', 'streaming_vision_encoder.vit_lite.pos_embed_spatial', 'streaming_vision_encoder.vit_lite.pos_embed_cls', 'streaming_vision_encoder.vit_lite.patch_embed.proj.weight', 'streaming_vision_encoder.vit_lite.patch_embed.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight', 
'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias', 
'streaming_vision_encoder.vit_lite.blocks.3.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias', 'streaming_vision_encoder.rnn.weight_ih_l0', 'streaming_vision_encoder.rnn.weight_hh_l0', 'streaming_vision_encoder.rnn.bias_ih_l0', 'streaming_vision_encoder.rnn.bias_hh_l0', 'streaming_vision_encoder.output_fc.0.weight', 'streaming_vision_encoder.output_fc.0.bias', 'streaming_vision_align.0.weight', 'streaming_vision_align.0.bias', 'streaming_vision_align.1.weight', 'streaming_vision_align.1.bias'], unexpected_keys=[]) +2025-05-12T09:03:07 | INFO | tasks_clip.shared_utils : Change to bfloat16 for model +2025-05-12T09:03:07 | INFO | utils.optimizer : diff_names: [], diff_lr: None +2025-05-12T09:03:07 | INFO | utils.optimizer : param temp: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.cls_token: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_spatial: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_cls: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias: wd: 0, lr: 1e-05 
+2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | 
utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_ih_l0: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_hh_l0: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_ih_l0: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_hh_l0: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_align.0.weight: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_align.0.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_align.1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:03:07 | INFO | utils.optimizer : param streaming_vision_align.1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:03:07 | INFO | 
utils.optimizer : optimizer -- lr=1e-05 wd=0 len(p)=32 +2025-05-12T09:03:07 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0.01 len(p)=24 +2025-05-12T09:03:07 | INFO | tasks_clip.shared_utils : Auto resuming +2025-05-12T09:03:07 | INFO | tasks_clip.shared_utils : Not found checkpoint in scripts/pretraining/clip/B14/B14 +2025-05-12T09:03:07 | INFO | tasks_clip.shared_utils : Use deepspeed to initialize model!!! +2025-05-12T09:03:07 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T09:03:07 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T09:03:08 | INFO | tasks_clip.shared_utils : Cuda memory after create model: 379M, Max mem: 379M +2025-05-12T09:03:08 | INFO | __main__ : Start training +2025-05-12T09:03:08 | INFO | __main__ : Epoch: 0 +2025-05-12T09:03:08 | WARNING | __main__ : Model does not have a 'transform' or 'config.model.vision_encoder.img_size' attribute. Using default transform. +2025-05-12T09:03:08 | INFO | __main__ : Getting evaluation video from qingy2024/backflip_train (1.mp4) +2025-05-12T09:03:08 | INFO | dataset.dataloader : Do not skip steps for any dataloader! +2025-05-12T09:03:08 | INFO | dataset.dataloader : MetaLoader has 1 dataloaders, 4978 batches in total +dataloader index=0 name=video, batch-size=16 length(#batches)=4978 +2025-05-12T09:03:26 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T09:03:26 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T09:03:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:03:51 | INFO | __main__ : Step: 100 +2025-05-12T09:03:51 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T09:03:51 | INFO | __main__ : Batch-wise Cosine Similarity | -2.53% +2025-05-12T09:03:51 | INFO | __main__ : Cosine Embedding Loss | 1.0253 +2025-05-12T09:03:51 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:03:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:03:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:04:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:04:15 | INFO | __main__ : Step: 200 +2025-05-12T09:04:15 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T09:04:15 | INFO | __main__ : Batch-wise Cosine Similarity | -1.13% +2025-05-12T09:04:15 | INFO | __main__ : Cosine Embedding Loss | 1.0113 +2025-05-12T09:04:15 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:04:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:04:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:04:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 0/4978] eta: 4 days, 9:48:52 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 1.0124 eval_avg_sim: No data video-cosine_similarity: -0.0124 time: 76.5233 
data: 17.6402 max mem: 11173 res mem: 12166 +2025-05-12T09:04:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:04:39 | INFO | __main__ : Step: 300 +2025-05-12T09:04:39 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T09:04:39 | INFO | __main__ : Batch-wise Cosine Similarity | 0.53% +2025-05-12T09:04:39 | INFO | __main__ : Cosine Embedding Loss | 0.9947 +2025-05-12T09:04:39 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:04:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:04:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:05:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:05:04 | INFO | __main__ : Step: 400 +2025-05-12T09:05:04 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T09:05:04 | INFO | __main__ : Batch-wise Cosine Similarity | 3.91% +2025-05-12T09:05:04 | INFO | __main__ : Cosine Embedding Loss | 0.9609 +2025-05-12T09:05:04 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:05:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:05:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:05:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 1/4978] eta: 3 days, 21:18:40 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.9477 eval_avg_sim: No data video-cosine_similarity: 0.0523 time: 67.4946 data: 8.8202 max mem: 11173 res mem: 15242 +2025-05-12T09:05:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:05:28 | INFO | __main__ : Step: 500 +2025-05-12T09:05:28 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T09:05:28 | INFO | __main__ : Batch-wise Cosine Similarity | 5.06% +2025-05-12T09:05:28 | 
INFO | __main__ : Cosine Embedding Loss | 0.9494 +2025-05-12T09:05:28 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:05:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:05:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:05:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:05:52 | INFO | __main__ : Step: 600 +2025-05-12T09:05:52 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T09:05:52 | INFO | __main__ : Batch-wise Cosine Similarity | 8.26% +2025-05-12T09:05:52 | INFO | __main__ : Cosine Embedding Loss | 0.9174 +2025-05-12T09:05:52 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:05:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:05:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:06:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:06:16 | INFO | __main__ : Step: 700 +2025-05-12T09:06:16 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T09:06:16 | INFO | __main__ : Batch-wise Cosine Similarity | 19.21% +2025-05-12T09:06:16 | INFO | __main__ : Cosine Embedding Loss | 0.8079 +2025-05-12T09:06:16 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:06:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:06:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:06:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 2/4978] eta: 3 days, 17:07:30 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.8063 eval_avg_sim: No data video-cosine_similarity: 0.1937 time: 64.4796 data: 5.8804 max mem: 11173 res mem: 15242 +2025-05-12T09:06:41 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:06:41 | INFO | __main__ : Step: 800 +2025-05-12T09:06:41 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T09:06:41 | INFO | __main__ : Batch-wise Cosine Similarity | 17.65% +2025-05-12T09:06:41 | INFO | __main__ : Cosine Embedding Loss | 0.8235 +2025-05-12T09:06:41 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:06:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:06:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:07:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:07:05 | INFO | __main__ : Step: 900 +2025-05-12T09:07:05 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T09:07:05 | INFO | __main__ : Batch-wise Cosine Similarity | 22.68% +2025-05-12T09:07:05 | INFO | __main__ : Cosine Embedding Loss | 0.7732 +2025-05-12T09:07:05 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:07:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:07:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:07:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 3/4978] eta: 3 days, 14:52:53 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.7545 eval_avg_sim: No data video-cosine_similarity: 0.2455 time: 62.8691 data: 4.4103 max mem: 11173 res mem: 15242 +2025-05-12T09:07:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:07:29 | INFO | __main__ : Step: 1000 +2025-05-12T09:07:29 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T09:07:29 | INFO | __main__ : Batch-wise Cosine Similarity | 25.19% +2025-05-12T09:07:29 | INFO | __main__ : Cosine Embedding Loss | 0.7481 +2025-05-12T09:07:29 | INFO | 
__main__ : Learning Rate | 0.000000 +2025-05-12T09:07:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:16:07 | INFO | vindlu : Logging to: scripts/pretraining/clip/B14/B14/train.log +2025-05-12T09:16:07 | INFO | utils.config_utils : config: { + root_path: /home/zli + available_corpus: { + cc3m: { + anno_path: your_path + data_root: + media_type: image } + webvid_10m: { + anno_path: your_path + data_root: + media_type: video } + smol_test: { + anno_path: /root/IV2/InternVideo2/multi_modality/data_test/smol_test.json + data_root: /root/IV2/InternVideo2/multi_modality/data_test/ + media_type: video } + slim_kinetics: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + slim_kinetics_act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + VisionEncoders: { + + TextEncoders: { + bert: { + name: bert_base + pretrained: bert-base-uncased + config: configs/config_bert.json + d_model: 768 + fusion_layer: 9 } + bert_large: { + name: bert_large + pretrained: bert-large-uncased + config: configs/config_bert_large.json + d_model: 1024 + fusion_layer: 19 } + med_bert: { + name: med_bert_base + pretrained: bert-base-uncased + config: configs/med_config.json + d_model: 768 } + med_bert_large: { + name: med_bert_large + pretrained: bert-base-uncased + config: configs/med_large_config.json + d_model: 768 } } + train_corpus: slim_kinetics + train_file: { + anno_path: /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json + data_root: /home/zli/kinetics-dataset/k600/train/train + media_type: video } + test_file: { + act_val: { + anno_path: /home/zli/kinetics-dataset/k600/test/kinetics-test.json + data_root: /home/zli/kinetics-dataset/k600/test/ + media_type: video + is_act_rec: True } } + test_types: ['act_val'] + num_workers: 2 + stop_key: None + 
num_frames: 8 + num_frames_test: 8 + batch_size: 16 + batch_size_test: 16 + max_txt_l: 32 + size_t: 224 + inputs: { + image_res: 224 + video_input: { + num_frames: 8 + sample_type: all + num_frames_test: 8 + sample_type_test: all + random_aug: False } + max_txt_l: { + image: 32 + video: 32 } + batch_size: { + image: 16 + video: 16 } + batch_size_test: { + image: 16 + video: 16 } } + model: { + model_cls: InternVideo2_CLIP_small + vision_encoder: { + name: internvideo2 + in_chans: 3 + patch_size: 14 + img_size: 224 + qkv_bias: False + drop_path_rate: 0.0 + head_drop_path_rate: 0.0 + embed_dim: 768 + num_heads: 12 + mlp_ratio: 4 + init_values: 0.1 + qk_normalization: True + depth: 12 + use_flash_attn: True + use_fused_rmsnorm: True + use_fused_mlp: True + fused_mlp_heuristic: 1 + drop_cls_token: False + attn_pool_num_heads: 16 + clip_embed_dim: 768 + layerscale_no_force_fp32: True + num_frames: 8 + tubelet_size: 1 + sep_pos_embed: False + use_checkpoint: False + checkpoint_num: 0 + align_dim: 512 } + streaming_vision_encoder: { + in_chans: 3 + patch_size: 14 + img_size: 224 + vit_qkv_bias: True + vit_drop_path_rate: 0.05 + student_embed_dim: 384 + student_depth: 4 + student_num_heads: 6 + vit_mlp_ratio: 3.0 + vit_init_values: None + vit_qk_normalization: False + vit_sep_pos_embed: True + vit_norm_layer_type: rmsnorm + rnn_type: lstm + rnn_hidden_size: 1024 + rnn_num_layers: 1 + fc_hidden_layers: [] + teacher_clip_embed_dim: 768 + student_num_frames_processed_by_vit: 1 + student_tubelet_size_for_vit: 1 } + text_encoder: { + name: mobileclip_b } + temp: 0.01 + temp_min: 0.01 + freeze_vision: True + open_vision_clip_projector: False + freeze_text: True + open_text_projection: False + open_text_lora: False + vision_ckpt_path: /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin + load_vision_ckpt_from_internvideo2_stage2: False + text_ckpt_path: /home/zli/IV2/models/mobileclip_blt.pt + extra_ckpt_path: /home/zli/IV2/models/clip/B14/pytorch_model.bin } + 
criterion: { + loss_weight: { + vtc: 1.0 } } + optimizer: { + opt: adamW + lr: 1e-05 + opt_betas: [0.9, 0.98] + weight_decay: 0.01 + max_grad_norm: 0.7 + different_lr: { + enable: False + module_names: [] + lr: 1e-05 } } + scheduler: { + sched: cosine + epochs: 1 + min_lr_multi: 0.01 + warmup_epochs: 0.05 } + evaluate: False + deep_fusion: False + evaluation: { + eval_frame_ensemble: concat + eval_x_only: False + k_test: 128 + eval_offload: True } + use_half_precision: True + use_bf16: True + gradient_checkpointing: True + wandb: { + enable: True + entity: qingy2019-conker-mobile-inc- + project: window_iv2 } + dist_url: env:// + device: cuda + mode: pt + output_dir: scripts/pretraining/clip/B14/B14 + resume: True + debug: False + log_freq: 1 + seed: 42 + save_latest: False + save_iter: 5000 + eval_freq_steps: 1000 + eval_video_repo_id: qingy2024/backflip_train + eval_video_filename: 1.mp4 + eval_plot_output_dir: scripts/pretraining/clip/B14/cosine_sim_graphs + auto_resume: True + pretrained_path: + deepspeed: { + enable: True + stage: 1 } + rank: 0 + world_size: 1 + gpu: 0 + distributed: True + dist_backend: nccl + deepspeed_config: scripts/pretraining/clip/B14/B14/deepspeed_config.json } +2025-05-12T09:16:09 | INFO | __main__ : train_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T09:16:09 | INFO | __main__ : Creating dataset for pt +2025-05-12T09:16:09 | WARNING | dataset : Make sure that you don't need audio input!!! 
+2025-05-12T09:16:09 | INFO | dataset : dataset_type: pt_train media_type: video dataset_cls: +2025-05-12T09:16:09 | INFO | dataset : dataset_type=pt_train, train_file={'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T09:16:09 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'}, 'transform': Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'num_epochs': 1, 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 10} +2025-05-12T09:16:09 | INFO | dataset : train_transform: +2025-05-12T09:16:09 | INFO | dataset : Compose( + Lambda() + RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True) + RandomHorizontalFlip(p=0.5) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T09:16:09 | INFO | dataset.pt_dataset : ann_file: {'anno_path': '/home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json', 'data_root': '/home/zli/kinetics-dataset/k600/train/train', 'media_type': 'video'} +2025-05-12T09:16:09 | INFO | dataset.pt_dataset : Loading json file /home/zli/kinetics-dataset/k600/train/train/kinetics_v2.json +2025-05-12T09:16:09 | INFO | dataset.pt_dataset : Loading from local file! 
+2025-05-12T09:16:09 | INFO | dataset.pt_dataset : Num samples: 99243 +2025-05-12T09:16:09 | INFO | dataset.pt_dataset : Num too short: 19586 +2025-05-12T09:16:09 | INFO | dataset.pt_dataset : num_examples: 79657 +2025-05-12T09:16:09 | INFO | dataset : Use ConcatDataset for video +2025-05-12T09:16:09 | WARNING | dataset : Make sure that you don't need audio input!!! +2025-05-12T09:16:09 | INFO | dataset : dataset_type: ret_eval media_type: video dataset_cls: +2025-05-12T09:16:09 | INFO | dataset : dataset_type=pt_eval, test_file={'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True} +2025-05-12T09:16:09 | INFO | dataset : {'ann_file': {'anno_path': '/home/zli/kinetics-dataset/k600/test/kinetics-test.json', 'data_root': '/home/zli/kinetics-dataset/k600/test/', 'media_type': 'video', 'is_act_rec': True}, 'transform': Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +), 'video_reader_type': 'decord', 'sample_type': 'all', 'num_frames': 8, 'num_tries': 1} +2025-05-12T09:16:09 | INFO | dataset : test_transform: +2025-05-12T09:16:09 | INFO | dataset : Compose( + Resize(size=(224, 224), interpolation=bicubic, max_size=None, antialias=True) + Lambda() + Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) +) +2025-05-12T09:16:09 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T09:16:09 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T09:16:09 | INFO | dataset.ret_dataset : Action recognition, number of prompts: 16 +2025-05-12T09:16:09 | INFO | dataset.ret_dataset : Action recognition, number of classes: 512 +2025-05-12T09:16:09 | INFO | tasks_clip.shared_utils : Creating model +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze cls_token 
+2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze pos_embed +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze patch_embed.proj.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm1.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.qkv.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.proj.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.q_norm.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.attn.k_norm.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls1.gamma +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.norm2.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc1.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.mlp.fc2.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.0.ls2.gamma +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm1.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.qkv.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.proj.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.attn.q_norm.weight +2025-05-12T09:16:09 | INFO | 
models.internvideo2_clip_small : Freeze blocks.1.attn.k_norm.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls1.gamma +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.norm2.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc1.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.mlp.fc2.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.1.ls2.gamma +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm1.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.qkv.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.proj.bias +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.q_norm.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.attn.k_norm.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls1.gamma +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.norm2.weight +2025-05-12T09:16:09 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.2.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.2.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 
blocks.3.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.3.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.weight 
+2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.4.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.5.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.proj.bias +2025-05-12T09:16:10 | INFO | 
models.internvideo2_clip_small : Freeze blocks.6.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.6.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.7.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : 
Freeze blocks.7.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.8.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.norm2.weight 
+2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.9.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.10.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.qkv.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.weight +2025-05-12T09:16:10 | 
INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.q_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.attn.k_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls1.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.norm2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.mlp.fc2.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze blocks.11.ls2.gamma +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_q.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_k.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.norm1_v.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q_bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.k_bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v_bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.q.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 
clip_projector.cross_attn.k.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.v.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze clip_projector.cross_attn.proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : ---- Froze all the vision encoder params ---- +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : ---- Froze all the vision align params ---- +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze projection_layer +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze embedding_layer.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze positional_embedding.pos_embed.pos_embed +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | 
models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.0.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.1.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.0.bias 
+2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.2.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | 
models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.3.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.4.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze 
transformer.5.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.5.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.weight 
+2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.6.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.7.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : 
Freeze transformer.8.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.8.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.1.bias 
+2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.9.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.10.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.qkv_proj.bias +2025-05-12T09:16:10 | INFO | 
models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_mha.1.out_proj.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.0.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.1.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze transformer.11.pre_norm_ffn.4.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.weight +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Freeze final_layer_norm.bias +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Load vision_encoder checkpoint from /home/zli/IV2/models/stage1/B14/B14_dist_1B_stage2/pytorch_model.bin +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Load text_encoder checkpoint from /home/zli/IV2/models/mobileclip_blt.pt +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : Load extra checkpoint from /home/zli/IV2/models/clip/B14/pytorch_model.bin +2025-05-12T09:16:10 | INFO | models.internvideo2_clip_small : _IncompatibleKeys(missing_keys=['streaming_vision_encoder.vit_lite.cls_token', 'streaming_vision_encoder.vit_lite.pos_embed_spatial', 'streaming_vision_encoder.vit_lite.pos_embed_cls', 'streaming_vision_encoder.vit_lite.patch_embed.proj.weight', 'streaming_vision_encoder.vit_lite.patch_embed.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight', 
'streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.0.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.1.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias', 'streaming_vision_encoder.vit_lite.blocks.2.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias', 'streaming_vision_encoder.vit_lite.blocks.3.norm1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight', 'streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias', 
'streaming_vision_encoder.vit_lite.blocks.3.norm2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight', 'streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias', 'streaming_vision_encoder.rnn.weight_ih_l0', 'streaming_vision_encoder.rnn.weight_hh_l0', 'streaming_vision_encoder.rnn.bias_ih_l0', 'streaming_vision_encoder.rnn.bias_hh_l0', 'streaming_vision_encoder.output_fc.0.weight', 'streaming_vision_encoder.output_fc.0.bias', 'streaming_vision_align.0.weight', 'streaming_vision_align.0.bias', 'streaming_vision_align.1.weight', 'streaming_vision_align.1.bias'], unexpected_keys=[]) +2025-05-12T09:16:10 | INFO | tasks_clip.shared_utils : Change to bfloat16 for model +2025-05-12T09:16:10 | INFO | utils.optimizer : diff_names: [], diff_lr: None +2025-05-12T09:16:10 | INFO | utils.optimizer : param temp: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.cls_token: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_spatial: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.pos_embed_cls: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.patch_embed.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.qkv.bias: wd: 0, lr: 1e-05 
+2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.0.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | 
utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.1.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.2.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm1.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.qkv.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param 
streaming_vision_encoder.vit_lite.blocks.3.attn.proj.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.attn.proj.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.norm2.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.vit_lite.blocks.3.mlp.fc2.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_ih_l0: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.weight_hh_l0: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_ih_l0: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.rnn.bias_hh_l0: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_encoder.output_fc.0.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_align.0.weight: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_align.0.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_align.1.weight: wd: 0.01, lr: 1e-05 +2025-05-12T09:16:10 | INFO | utils.optimizer : param streaming_vision_align.1.bias: wd: 0, lr: 1e-05 +2025-05-12T09:16:10 | INFO | 
utils.optimizer : optimizer -- lr=1e-05 wd=0 len(p)=32 +2025-05-12T09:16:10 | INFO | utils.optimizer : optimizer -- lr=1e-05 wd=0.01 len(p)=24 +2025-05-12T09:16:10 | INFO | tasks_clip.shared_utils : Auto resuming +2025-05-12T09:16:10 | INFO | tasks_clip.shared_utils : Not found checkpoint in scripts/pretraining/clip/B14/B14 +2025-05-12T09:16:10 | INFO | tasks_clip.shared_utils : Use deepspeed to initialize model!!! +2025-05-12T09:16:10 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T09:16:10 | WARNING | py.warnings : /home/zli/IV2/InternVideo2/multi_modality/utils/distributed.py:24: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + builtin_warn(*args, **kwargs) + +2025-05-12T09:16:11 | INFO | tasks_clip.shared_utils : Cuda memory after create model: 379M, Max mem: 379M +2025-05-12T09:16:11 | INFO | __main__ : Start training +2025-05-12T09:16:11 | INFO | __main__ : Epoch: 0 +2025-05-12T09:16:11 | WARNING | __main__ : Model does not have a 'transform' or 'config.model.vision_encoder.img_size' attribute. Using default transform. +2025-05-12T09:16:11 | INFO | __main__ : Getting evaluation video from qingy2024/backflip_train (1.mp4) +2025-05-12T09:16:11 | INFO | dataset.dataloader : Do not skip steps for any dataloader! +2025-05-12T09:16:11 | INFO | dataset.dataloader : MetaLoader has 1 dataloaders, 4978 batches in total +dataloader index=0 name=video, batch-size=16 length(#batches)=4978 +2025-05-12T09:16:29 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T09:16:29 | WARNING | py.warnings : /home/zli/miniconda3/lib/python3.10/site-packages/typing_extensions.py:2852: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + +2025-05-12T09:16:30 | INFO | __main__ : Performing periodic evaluation at global step 0... +2025-05-12T09:16:30 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:16:30 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:16:30 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:16:30 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:16:40 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: -0.0009 +2025-05-12T09:16:40 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0000000.png +2025-05-12T09:16:40 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:16:40 | INFO | __main__ : Evaluation at step 0 complete. 
Average Similarity: -0.0009 +2025-05-12T09:17:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:17:05 | INFO | __main__ : Step: 100 +2025-05-12T09:17:05 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T09:17:05 | INFO | __main__ : Batch-wise Cosine Similarity | -2.53% +2025-05-12T09:17:05 | INFO | __main__ : Cosine Embedding Loss | 1.0253 +2025-05-12T09:17:05 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:17:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:17:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:17:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:17:29 | INFO | __main__ : Step: 200 +2025-05-12T09:17:29 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T09:17:29 | INFO | __main__ : Batch-wise Cosine Similarity | -1.13% +2025-05-12T09:17:29 | INFO | __main__ : Cosine Embedding Loss | 1.0113 +2025-05-12T09:17:29 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:17:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:17:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:17:38 | INFO | utils.basic_utils : Train Epoch: [0] [ 0/4978] eta: 5 days, 0:33:29 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 1.0124 eval_avg_sim: -0.0009 video-cosine_similarity: -0.0124 time: 87.1856 data: 17.7268 max mem: 11173 res mem: 12128 +2025-05-12T09:17:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:17:53 | INFO | __main__ : Step: 300 +2025-05-12T09:17:53 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T09:17:53 | INFO | __main__ : Batch-wise Cosine Similarity | 0.53% +2025-05-12T09:17:53 | INFO | 
__main__ : Cosine Embedding Loss | 0.9947 +2025-05-12T09:17:53 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:17:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:17:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:18:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:18:17 | INFO | __main__ : Step: 400 +2025-05-12T09:18:17 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T09:18:17 | INFO | __main__ : Batch-wise Cosine Similarity | 3.91% +2025-05-12T09:18:17 | INFO | __main__ : Cosine Embedding Loss | 0.9609 +2025-05-12T09:18:17 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:18:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:18:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:18:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 1/4978] eta: 4 days, 4:45:09 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.9477 eval_avg_sim: -0.0009 video-cosine_similarity: 0.0523 time: 72.8771 data: 8.8635 max mem: 11173 res mem: 15204 +2025-05-12T09:18:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:18:42 | INFO | __main__ : Step: 500 +2025-05-12T09:18:42 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T09:18:42 | INFO | __main__ : Batch-wise Cosine Similarity | 5.06% +2025-05-12T09:18:42 | INFO | __main__ : Cosine Embedding Loss | 0.9494 +2025-05-12T09:18:42 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:18:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:18:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:19:06 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:19:06 | INFO | __main__ : Step: 600 +2025-05-12T09:19:06 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T09:19:06 | INFO | __main__ : Batch-wise Cosine Similarity | 8.26% +2025-05-12T09:19:06 | INFO | __main__ : Cosine Embedding Loss | 0.9174 +2025-05-12T09:19:06 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:19:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:19:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:19:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:19:30 | INFO | __main__ : Step: 700 +2025-05-12T09:19:30 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T09:19:30 | INFO | __main__ : Batch-wise Cosine Similarity | 19.21% +2025-05-12T09:19:30 | INFO | __main__ : Cosine Embedding Loss | 0.8079 +2025-05-12T09:19:30 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:19:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:19:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:19:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 2/4978] eta: 3 days, 22:04:21 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.8063 eval_avg_sim: -0.0009 video-cosine_similarity: 0.1937 time: 68.0590 data: 5.9092 max mem: 11173 res mem: 15204 +2025-05-12T09:19:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:19:54 | INFO | __main__ : Step: 800 +2025-05-12T09:19:54 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T09:19:54 | INFO | __main__ : Batch-wise Cosine Similarity | 17.65% +2025-05-12T09:19:54 | INFO | __main__ : Cosine Embedding Loss | 0.8235 +2025-05-12T09:19:54 | INFO | 
__main__ : Learning Rate | 0.000000 +2025-05-12T09:19:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:19:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:20:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:20:18 | INFO | __main__ : Step: 900 +2025-05-12T09:20:18 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T09:20:18 | INFO | __main__ : Batch-wise Cosine Similarity | 22.68% +2025-05-12T09:20:18 | INFO | __main__ : Cosine Embedding Loss | 0.7732 +2025-05-12T09:20:18 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:20:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:20:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:20:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 3/4978] eta: 3 days, 18:35:16 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.7545 eval_avg_sim: -0.0009 video-cosine_similarity: 0.2455 time: 65.5511 data: 4.4340 max mem: 11173 res mem: 15204 +2025-05-12T09:20:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:20:42 | INFO | __main__ : Step: 1000 +2025-05-12T09:20:42 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T09:20:42 | INFO | __main__ : Batch-wise Cosine Similarity | 25.19% +2025-05-12T09:20:42 | INFO | __main__ : Cosine Embedding Loss | 0.7481 +2025-05-12T09:20:42 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:20:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:20:42 | INFO | __main__ : Evaluation Average Sim | -0.0009 +2025-05-12T09:20:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:20:43 | INFO | __main__ : Performing periodic evaluation at global step 1000... 
+2025-05-12T09:20:43 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:20:43 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:20:43 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:20:43 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:20:53 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.2379 +2025-05-12T09:20:53 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0001000.png +2025-05-12T09:20:53 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:20:53 | INFO | __main__ : Evaluation at step 1000 complete. Average Similarity: 0.2379 +2025-05-12T09:21:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:21:17 | INFO | __main__ : Step: 1100 +2025-05-12T09:21:17 | INFO | __main__ : Current Frame Index within Batch Video: 142/247 +2025-05-12T09:21:17 | INFO | __main__ : Batch-wise Cosine Similarity | 31.37% +2025-05-12T09:21:17 | INFO | __main__ : Cosine Embedding Loss | 0.6863 +2025-05-12T09:21:17 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:21:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:21:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:21:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:21:41 | INFO | __main__ : Step: 1200 +2025-05-12T09:21:41 | INFO | __main__ : Current Frame Index within Batch Video: 242/247 +2025-05-12T09:21:41 | INFO | __main__ : Batch-wise Cosine Similarity | 37.14% +2025-05-12T09:21:41 | INFO | __main__ : Cosine Embedding Loss | 0.6286 +2025-05-12T09:21:41 | 
INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:21:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:21:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:21:42 | INFO | utils.basic_utils : Train Epoch: [0] [ 4/4978] eta: 3 days, 19:20:06 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.6235 eval_avg_sim: 0.2379 video-cosine_similarity: 0.3765 time: 66.1050 data: 3.5472 max mem: 11173 res mem: 15204 +2025-05-12T09:22:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:22:05 | INFO | __main__ : Step: 1300 +2025-05-12T09:22:05 | INFO | __main__ : Current Frame Index within Batch Video: 101/247 +2025-05-12T09:22:05 | INFO | __main__ : Batch-wise Cosine Similarity | 35.53% +2025-05-12T09:22:05 | INFO | __main__ : Cosine Embedding Loss | 0.6447 +2025-05-12T09:22:05 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:22:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:22:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:22:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:22:29 | INFO | __main__ : Step: 1400 +2025-05-12T09:22:29 | INFO | __main__ : Current Frame Index within Batch Video: 201/247 +2025-05-12T09:22:29 | INFO | __main__ : Batch-wise Cosine Similarity | 43.80% +2025-05-12T09:22:29 | INFO | __main__ : Cosine Embedding Loss | 0.5620 +2025-05-12T09:22:29 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:22:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:22:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:22:40 | INFO | utils.basic_utils : Train Epoch: [0] [ 5/4978] eta: 3 days, 17:23:03 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.5272 eval_avg_sim: 
0.2379 video-cosine_similarity: 0.4728 time: 64.7062 data: 2.9560 max mem: 11173 res mem: 15204 +2025-05-12T09:22:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:22:53 | INFO | __main__ : Step: 1500 +2025-05-12T09:22:53 | INFO | __main__ : Current Frame Index within Batch Video: 60/247 +2025-05-12T09:22:53 | INFO | __main__ : Batch-wise Cosine Similarity | 48.72% +2025-05-12T09:22:53 | INFO | __main__ : Cosine Embedding Loss | 0.5128 +2025-05-12T09:22:53 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:22:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:22:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:23:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:23:16 | INFO | __main__ : Step: 1600 +2025-05-12T09:23:16 | INFO | __main__ : Current Frame Index within Batch Video: 160/247 +2025-05-12T09:23:16 | INFO | __main__ : Batch-wise Cosine Similarity | 52.85% +2025-05-12T09:23:16 | INFO | __main__ : Cosine Embedding Loss | 0.4715 +2025-05-12T09:23:16 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:23:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:23:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:23:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 6/4978] eta: 3 days, 15:57:49 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.4364 eval_avg_sim: 0.2379 video-cosine_similarity: 0.5636 time: 63.6906 data: 2.5338 max mem: 11173 res mem: 15204 +2025-05-12T09:23:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:23:40 | INFO | __main__ : Step: 1700 +2025-05-12T09:23:40 | INFO | __main__ : Current Frame Index within Batch Video: 19/247 +2025-05-12T09:23:40 | INFO | __main__ : 
Batch-wise Cosine Similarity | 52.54% +2025-05-12T09:23:40 | INFO | __main__ : Cosine Embedding Loss | 0.4746 +2025-05-12T09:23:40 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:23:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:23:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:04 | INFO | __main__ : Step: 1800 +2025-05-12T09:24:04 | INFO | __main__ : Current Frame Index within Batch Video: 119/247 +2025-05-12T09:24:04 | INFO | __main__ : Batch-wise Cosine Similarity | 58.54% +2025-05-12T09:24:04 | INFO | __main__ : Cosine Embedding Loss | 0.4146 +2025-05-12T09:24:04 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:24:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:24:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:28 | INFO | __main__ : Step: 1900 +2025-05-12T09:24:28 | INFO | __main__ : Current Frame Index within Batch Video: 219/247 +2025-05-12T09:24:28 | INFO | __main__ : Batch-wise Cosine Similarity | 62.85% +2025-05-12T09:24:28 | INFO | __main__ : Cosine Embedding Loss | 0.3715 +2025-05-12T09:24:28 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:24:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:24:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 7/4978] eta: 3 days, 14:53:28 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.3630 eval_avg_sim: 0.2379 video-cosine_similarity: 0.6370 time: 62.9267 data: 2.2170 max mem: 11173 res mem: 15204 +2025-05-12T09:24:52 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:52 | INFO | __main__ : Step: 2000 +2025-05-12T09:24:52 | INFO | __main__ : Current Frame Index within Batch Video: 78/247 +2025-05-12T09:24:52 | INFO | __main__ : Batch-wise Cosine Similarity | 61.84% +2025-05-12T09:24:52 | INFO | __main__ : Cosine Embedding Loss | 0.3816 +2025-05-12T09:24:52 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:24:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:24:52 | INFO | __main__ : Evaluation Average Sim | 0.2379 +2025-05-12T09:24:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:24:52 | INFO | __main__ : Performing periodic evaluation at global step 2000... +2025-05-12T09:24:52 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:24:52 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:24:52 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:24:52 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:25:02 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6032 +2025-05-12T09:25:02 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0002000.png +2025-05-12T09:25:02 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:25:02 | INFO | __main__ : Evaluation at step 2000 complete. 
Average Similarity: 0.6032 +2025-05-12T09:25:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:25:25 | INFO | __main__ : Step: 2100 +2025-05-12T09:25:25 | INFO | __main__ : Current Frame Index within Batch Video: 178/247 +2025-05-12T09:25:25 | INFO | __main__ : Batch-wise Cosine Similarity | 65.68% +2025-05-12T09:25:25 | INFO | __main__ : Cosine Embedding Loss | 0.3432 +2025-05-12T09:25:25 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:25:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:25:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:25:42 | INFO | utils.basic_utils : Train Epoch: [0] [ 8/4978] eta: 3 days, 15:28:53 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.3228 eval_avg_sim: 0.6032 video-cosine_similarity: 0.6772 time: 63.3670 data: 1.9707 max mem: 11173 res mem: 15204 +2025-05-12T09:25:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:25:49 | INFO | __main__ : Step: 2200 +2025-05-12T09:25:49 | INFO | __main__ : Current Frame Index within Batch Video: 37/247 +2025-05-12T09:25:49 | INFO | __main__ : Batch-wise Cosine Similarity | 62.13% +2025-05-12T09:25:49 | INFO | __main__ : Cosine Embedding Loss | 0.3787 +2025-05-12T09:25:49 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:25:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:25:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:26:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:26:13 | INFO | __main__ : Step: 2300 +2025-05-12T09:26:13 | INFO | __main__ : Current Frame Index within Batch Video: 137/247 +2025-05-12T09:26:13 | INFO | __main__ : Batch-wise Cosine Similarity | 66.92% +2025-05-12T09:26:13 | INFO | 
__main__ : Cosine Embedding Loss | 0.3308 +2025-05-12T09:26:13 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:26:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:26:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:26:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:26:37 | INFO | __main__ : Step: 2400 +2025-05-12T09:26:37 | INFO | __main__ : Current Frame Index within Batch Video: 237/247 +2025-05-12T09:26:37 | INFO | __main__ : Batch-wise Cosine Similarity | 67.53% +2025-05-12T09:26:37 | INFO | __main__ : Cosine Embedding Loss | 0.3247 +2025-05-12T09:26:37 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:26:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:26:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:26:39 | INFO | utils.basic_utils : Train Epoch: [0] [ 9/4978] eta: 3 days, 14:39:15 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.3258 eval_avg_sim: 0.6032 video-cosine_similarity: 0.6742 time: 62.7802 data: 1.7737 max mem: 11173 res mem: 15204 +2025-05-12T09:27:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:27:01 | INFO | __main__ : Step: 2500 +2025-05-12T09:27:01 | INFO | __main__ : Current Frame Index within Batch Video: 96/247 +2025-05-12T09:27:01 | INFO | __main__ : Batch-wise Cosine Similarity | 68.23% +2025-05-12T09:27:01 | INFO | __main__ : Cosine Embedding Loss | 0.3177 +2025-05-12T09:27:01 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:27:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:27:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:27:24 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:27:24 | INFO | __main__ : Step: 2600 +2025-05-12T09:27:24 | INFO | __main__ : Current Frame Index within Batch Video: 196/247 +2025-05-12T09:27:24 | INFO | __main__ : Batch-wise Cosine Similarity | 70.31% +2025-05-12T09:27:24 | INFO | __main__ : Cosine Embedding Loss | 0.2969 +2025-05-12T09:27:24 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:27:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:27:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:27:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 10/4978] eta: 3 days, 13:58:04 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.2955 eval_avg_sim: 0.6032 video-cosine_similarity: 0.7045 time: 62.2957 data: 1.6125 max mem: 11173 res mem: 15204 +2025-05-12T09:27:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:27:48 | INFO | __main__ : Step: 2700 +2025-05-12T09:27:48 | INFO | __main__ : Current Frame Index within Batch Video: 55/247 +2025-05-12T09:27:48 | INFO | __main__ : Batch-wise Cosine Similarity | 66.64% +2025-05-12T09:27:48 | INFO | __main__ : Cosine Embedding Loss | 0.3336 +2025-05-12T09:27:48 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:27:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:27:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:28:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:28:12 | INFO | __main__ : Step: 2800 +2025-05-12T09:28:12 | INFO | __main__ : Current Frame Index within Batch Video: 155/247 +2025-05-12T09:28:12 | INFO | __main__ : Batch-wise Cosine Similarity | 68.63% +2025-05-12T09:28:12 | INFO | __main__ : Cosine Embedding Loss | 0.3137 +2025-05-12T09:28:12 | INFO | 
__main__ : Learning Rate | 0.000000 +2025-05-12T09:28:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:28:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:28:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 11/4978] eta: 3 days, 13:24:07 lr: 0.000000 temperature: 0.0126 video-loss_cosine: 0.2853 eval_avg_sim: 0.6032 video-cosine_similarity: 0.7147 time: 61.8979 data: 1.4781 max mem: 11173 res mem: 15204 +2025-05-12T09:28:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:28:36 | INFO | __main__ : Step: 2900 +2025-05-12T09:28:36 | INFO | __main__ : Current Frame Index within Batch Video: 14/247 +2025-05-12T09:28:36 | INFO | __main__ : Batch-wise Cosine Similarity | 63.02% +2025-05-12T09:28:36 | INFO | __main__ : Cosine Embedding Loss | 0.3698 +2025-05-12T09:28:36 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:28:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:28:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:29:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:29:00 | INFO | __main__ : Step: 3000 +2025-05-12T09:29:00 | INFO | __main__ : Current Frame Index within Batch Video: 114/247 +2025-05-12T09:29:00 | INFO | __main__ : Batch-wise Cosine Similarity | 66.73% +2025-05-12T09:29:00 | INFO | __main__ : Cosine Embedding Loss | 0.3327 +2025-05-12T09:29:00 | INFO | __main__ : Learning Rate | 0.000000 +2025-05-12T09:29:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:29:00 | INFO | __main__ : Evaluation Average Sim | 0.6032 +2025-05-12T09:29:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:29:00 | INFO | __main__ : Performing periodic evaluation at global step 3000... 
+2025-05-12T09:29:00 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:29:00 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:29:00 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:29:00 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:29:09 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6723 +2025-05-12T09:29:09 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0003000.png +2025-05-12T09:29:09 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:29:09 | INFO | __main__ : Evaluation at step 3000 complete. Average Similarity: 0.6723 +2025-05-12T09:29:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:29:33 | INFO | __main__ : Step: 3100 +2025-05-12T09:29:33 | INFO | __main__ : Current Frame Index within Batch Video: 214/247 +2025-05-12T09:29:33 | INFO | __main__ : Batch-wise Cosine Similarity | 68.63% +2025-05-12T09:29:33 | INFO | __main__ : Cosine Embedding Loss | 0.3137 +2025-05-12T09:29:33 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:29:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:29:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:29:41 | INFO | utils.basic_utils : Train Epoch: [0] [ 12/4978] eta: 3 days, 13:53:52 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.3044 eval_avg_sim: 0.6723 video-cosine_similarity: 0.6956 time: 62.2699 data: 1.3644 max mem: 11173 res mem: 15204 +2025-05-12T09:29:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:29:57 | INFO | __main__ : 
Step: 3200 +2025-05-12T09:29:57 | INFO | __main__ : Current Frame Index within Batch Video: 73/247 +2025-05-12T09:29:57 | INFO | __main__ : Batch-wise Cosine Similarity | 69.43% +2025-05-12T09:29:57 | INFO | __main__ : Cosine Embedding Loss | 0.3057 +2025-05-12T09:29:57 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:29:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:29:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:30:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:30:21 | INFO | __main__ : Step: 3300 +2025-05-12T09:30:21 | INFO | __main__ : Current Frame Index within Batch Video: 173/247 +2025-05-12T09:30:21 | INFO | __main__ : Batch-wise Cosine Similarity | 69.59% +2025-05-12T09:30:21 | INFO | __main__ : Cosine Embedding Loss | 0.3041 +2025-05-12T09:30:21 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:30:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:30:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:30:38 | INFO | utils.basic_utils : Train Epoch: [0] [ 13/4978] eta: 3 days, 13:24:27 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2888 eval_avg_sim: 0.6723 video-cosine_similarity: 0.7112 time: 61.9269 data: 1.2676 max mem: 11173 res mem: 15204 +2025-05-12T09:30:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:30:45 | INFO | __main__ : Step: 3400 +2025-05-12T09:30:45 | INFO | __main__ : Current Frame Index within Batch Video: 32/247 +2025-05-12T09:30:45 | INFO | __main__ : Batch-wise Cosine Similarity | 65.90% +2025-05-12T09:30:45 | INFO | __main__ : Cosine Embedding Loss | 0.3410 +2025-05-12T09:30:45 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:30:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:30:45 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:31:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:31:08 | INFO | __main__ : Step: 3500 +2025-05-12T09:31:08 | INFO | __main__ : Current Frame Index within Batch Video: 132/247 +2025-05-12T09:31:08 | INFO | __main__ : Batch-wise Cosine Similarity | 67.88% +2025-05-12T09:31:08 | INFO | __main__ : Cosine Embedding Loss | 0.3212 +2025-05-12T09:31:08 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:31:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:31:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:31:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:31:32 | INFO | __main__ : Step: 3600 +2025-05-12T09:31:32 | INFO | __main__ : Current Frame Index within Batch Video: 232/247 +2025-05-12T09:31:32 | INFO | __main__ : Batch-wise Cosine Similarity | 70.80% +2025-05-12T09:31:32 | INFO | __main__ : Cosine Embedding Loss | 0.2920 +2025-05-12T09:31:32 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:31:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:31:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:31:36 | INFO | utils.basic_utils : Train Epoch: [0] [ 14/4978] eta: 3 days, 12:58:24 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2848 eval_avg_sim: 0.6723 video-cosine_similarity: 0.7152 time: 61.6246 data: 1.1831 max mem: 11173 res mem: 15204 +2025-05-12T09:31:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:31:56 | INFO | __main__ : Step: 3700 +2025-05-12T09:31:56 | INFO | __main__ : Current Frame Index within Batch Video: 91/247 +2025-05-12T09:31:56 | INFO | __main__ : 
Batch-wise Cosine Similarity | 68.59% +2025-05-12T09:31:56 | INFO | __main__ : Cosine Embedding Loss | 0.3141 +2025-05-12T09:31:56 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:31:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:31:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:32:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:32:20 | INFO | __main__ : Step: 3800 +2025-05-12T09:32:20 | INFO | __main__ : Current Frame Index within Batch Video: 191/247 +2025-05-12T09:32:20 | INFO | __main__ : Batch-wise Cosine Similarity | 70.35% +2025-05-12T09:32:20 | INFO | __main__ : Cosine Embedding Loss | 0.2965 +2025-05-12T09:32:20 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:32:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:32:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:32:33 | INFO | utils.basic_utils : Train Epoch: [0] [ 15/4978] eta: 3 days, 12:35:40 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2937 eval_avg_sim: 0.6723 video-cosine_similarity: 0.7063 time: 61.3621 data: 1.1091 max mem: 11173 res mem: 15204 +2025-05-12T09:32:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:32:44 | INFO | __main__ : Step: 3900 +2025-05-12T09:32:44 | INFO | __main__ : Current Frame Index within Batch Video: 50/247 +2025-05-12T09:32:44 | INFO | __main__ : Batch-wise Cosine Similarity | 68.25% +2025-05-12T09:32:44 | INFO | __main__ : Cosine Embedding Loss | 0.3175 +2025-05-12T09:32:44 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:32:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:32:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:33:08 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:33:08 | INFO | __main__ : Step: 4000 +2025-05-12T09:33:08 | INFO | __main__ : Current Frame Index within Batch Video: 150/247 +2025-05-12T09:33:08 | INFO | __main__ : Batch-wise Cosine Similarity | 70.93% +2025-05-12T09:33:08 | INFO | __main__ : Cosine Embedding Loss | 0.2907 +2025-05-12T09:33:08 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:33:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:33:08 | INFO | __main__ : Evaluation Average Sim | 0.6723 +2025-05-12T09:33:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:33:08 | INFO | __main__ : Performing periodic evaluation at global step 4000... +2025-05-12T09:33:08 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:33:08 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:33:08 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:33:08 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:33:17 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6552 +2025-05-12T09:33:17 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0004000.png +2025-05-12T09:33:17 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:33:17 | INFO | __main__ : Evaluation at step 4000 complete. 
Average Similarity: 0.6552 +2025-05-12T09:33:40 | INFO | utils.basic_utils : Train Epoch: [0] [ 16/4978] eta: 3 days, 13:00:51 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2702 eval_avg_sim: 0.6552 video-cosine_similarity: 0.7298 time: 61.6791 data: 1.0439 max mem: 11173 res mem: 15204 +2025-05-12T09:33:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:33:41 | INFO | __main__ : Step: 4100 +2025-05-12T09:33:41 | INFO | __main__ : Current Frame Index within Batch Video: 9/247 +2025-05-12T09:33:41 | INFO | __main__ : Batch-wise Cosine Similarity | 61.63% +2025-05-12T09:33:41 | INFO | __main__ : Cosine Embedding Loss | 0.3837 +2025-05-12T09:33:41 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:33:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:33:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:34:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:34:05 | INFO | __main__ : Step: 4200 +2025-05-12T09:34:05 | INFO | __main__ : Current Frame Index within Batch Video: 109/247 +2025-05-12T09:34:05 | INFO | __main__ : Batch-wise Cosine Similarity | 69.03% +2025-05-12T09:34:05 | INFO | __main__ : Cosine Embedding Loss | 0.3097 +2025-05-12T09:34:05 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:34:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:34:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:34:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:34:28 | INFO | __main__ : Step: 4300 +2025-05-12T09:34:28 | INFO | __main__ : Current Frame Index within Batch Video: 209/247 +2025-05-12T09:34:28 | INFO | __main__ : Batch-wise Cosine Similarity | 70.66% +2025-05-12T09:34:28 | INFO | 
__main__ : Cosine Embedding Loss | 0.2934 +2025-05-12T09:34:28 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:34:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:34:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:34:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 17/4978] eta: 3 days, 12:40:25 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2877 eval_avg_sim: 0.6552 video-cosine_similarity: 0.7123 time: 61.4444 data: 0.9859 max mem: 11173 res mem: 15204 +2025-05-12T09:34:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:34:52 | INFO | __main__ : Step: 4400 +2025-05-12T09:34:52 | INFO | __main__ : Current Frame Index within Batch Video: 68/247 +2025-05-12T09:34:52 | INFO | __main__ : Batch-wise Cosine Similarity | 70.97% +2025-05-12T09:34:52 | INFO | __main__ : Cosine Embedding Loss | 0.2903 +2025-05-12T09:34:52 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:34:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:34:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:35:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:35:16 | INFO | __main__ : Step: 4500 +2025-05-12T09:35:16 | INFO | __main__ : Current Frame Index within Batch Video: 168/247 +2025-05-12T09:35:16 | INFO | __main__ : Batch-wise Cosine Similarity | 72.95% +2025-05-12T09:35:16 | INFO | __main__ : Cosine Embedding Loss | 0.2705 +2025-05-12T09:35:16 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:35:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:35:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:35:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 18/4978] eta: 3 days, 12:21:40 lr: 
0.000001 temperature: 0.0126 video-loss_cosine: 0.2785 eval_avg_sim: 0.6552 video-cosine_similarity: 0.7215 time: 61.2299 data: 0.9340 max mem: 11173 res mem: 15204 +2025-05-12T09:35:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:35:40 | INFO | __main__ : Step: 4600 +2025-05-12T09:35:40 | INFO | __main__ : Current Frame Index within Batch Video: 27/247 +2025-05-12T09:35:40 | INFO | __main__ : Batch-wise Cosine Similarity | 65.18% +2025-05-12T09:35:40 | INFO | __main__ : Cosine Embedding Loss | 0.3482 +2025-05-12T09:35:40 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:35:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:35:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:36:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:36:04 | INFO | __main__ : Step: 4700 +2025-05-12T09:36:04 | INFO | __main__ : Current Frame Index within Batch Video: 127/247 +2025-05-12T09:36:04 | INFO | __main__ : Batch-wise Cosine Similarity | 69.86% +2025-05-12T09:36:04 | INFO | __main__ : Cosine Embedding Loss | 0.3014 +2025-05-12T09:36:04 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:36:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:36:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:36:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:36:27 | INFO | __main__ : Step: 4800 +2025-05-12T09:36:27 | INFO | __main__ : Current Frame Index within Batch Video: 227/247 +2025-05-12T09:36:27 | INFO | __main__ : Batch-wise Cosine Similarity | 72.13% +2025-05-12T09:36:27 | INFO | __main__ : Cosine Embedding Loss | 0.2787 +2025-05-12T09:36:27 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:36:27 | INFO 
| __main__ : Temperature | 0.0126 +2025-05-12T09:36:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:36:32 | INFO | utils.basic_utils : Train Epoch: [0] [ 19/4978] eta: 3 days, 12:05:08 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2754 eval_avg_sim: 0.6552 video-cosine_similarity: 0.7246 time: 61.0423 data: 0.8873 max mem: 11173 res mem: 15204 +2025-05-12T09:36:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:36:51 | INFO | __main__ : Step: 4900 +2025-05-12T09:36:51 | INFO | __main__ : Current Frame Index within Batch Video: 86/247 +2025-05-12T09:36:51 | INFO | __main__ : Batch-wise Cosine Similarity | 69.66% +2025-05-12T09:36:51 | INFO | __main__ : Cosine Embedding Loss | 0.3034 +2025-05-12T09:36:51 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:36:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:36:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:37:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:37:15 | INFO | __main__ : Step: 5000 +2025-05-12T09:37:15 | INFO | __main__ : Current Frame Index within Batch Video: 186/247 +2025-05-12T09:37:15 | INFO | __main__ : Batch-wise Cosine Similarity | 72.48% +2025-05-12T09:37:15 | INFO | __main__ : Cosine Embedding Loss | 0.2752 +2025-05-12T09:37:15 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:37:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:37:15 | INFO | __main__ : Evaluation Average Sim | 0.6552 +2025-05-12T09:37:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:37:15 | INFO | __main__ : Saving checkpoint at global step 5000 +2025-05-12T09:37:16 | INFO | __main__ : Performing periodic evaluation at global step 5000... 
+2025-05-12T09:37:16 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:37:16 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:37:16 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:37:16 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:37:25 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6593 +2025-05-12T09:37:25 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0005000.png +2025-05-12T09:37:25 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:37:25 | INFO | __main__ : Evaluation at step 5000 complete. Average Similarity: 0.6593 +2025-05-12T09:37:39 | INFO | utils.basic_utils : Train Epoch: [0] [ 20/4978] eta: 3 days, 12:27:02 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2690 eval_avg_sim: 0.6593 video-cosine_similarity: 0.7310 time: 60.0263 data: 0.0010 max mem: 11173 res mem: 15204 +2025-05-12T09:37:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:37:48 | INFO | __main__ : Step: 5100 +2025-05-12T09:37:48 | INFO | __main__ : Current Frame Index within Batch Video: 45/247 +2025-05-12T09:37:48 | INFO | __main__ : Batch-wise Cosine Similarity | 69.01% +2025-05-12T09:37:48 | INFO | __main__ : Cosine Embedding Loss | 0.3099 +2025-05-12T09:37:48 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:37:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:37:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:38:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:38:12 | INFO | __main__ : Step: 
5200 +2025-05-12T09:38:12 | INFO | __main__ : Current Frame Index within Batch Video: 145/247 +2025-05-12T09:38:12 | INFO | __main__ : Batch-wise Cosine Similarity | 72.78% +2025-05-12T09:38:12 | INFO | __main__ : Cosine Embedding Loss | 0.2722 +2025-05-12T09:38:12 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:38:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:38:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:38:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:38:36 | INFO | __main__ : Step: 5300 +2025-05-12T09:38:36 | INFO | __main__ : Current Frame Index within Batch Video: 245/247 +2025-05-12T09:38:36 | INFO | __main__ : Batch-wise Cosine Similarity | 74.87% +2025-05-12T09:38:36 | INFO | __main__ : Cosine Embedding Loss | 0.2513 +2025-05-12T09:38:36 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:38:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:38:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:38:36 | INFO | utils.basic_utils : Train Epoch: [0] [ 21/4978] eta: 3 days, 12:11:25 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2545 eval_avg_sim: 0.6593 video-cosine_similarity: 0.7455 time: 59.9695 data: 0.0010 max mem: 11173 res mem: 15204 +2025-05-12T09:39:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:39:00 | INFO | __main__ : Step: 5400 +2025-05-12T09:39:00 | INFO | __main__ : Current Frame Index within Batch Video: 104/247 +2025-05-12T09:39:00 | INFO | __main__ : Batch-wise Cosine Similarity | 69.87% +2025-05-12T09:39:00 | INFO | __main__ : Cosine Embedding Loss | 0.3013 +2025-05-12T09:39:00 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:39:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:39:00 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:39:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:39:24 | INFO | __main__ : Step: 5500 +2025-05-12T09:39:24 | INFO | __main__ : Current Frame Index within Batch Video: 204/247 +2025-05-12T09:39:24 | INFO | __main__ : Batch-wise Cosine Similarity | 71.71% +2025-05-12T09:39:24 | INFO | __main__ : Cosine Embedding Loss | 0.2829 +2025-05-12T09:39:24 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:39:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:39:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:39:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 22/4978] eta: 3 days, 11:57:06 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2821 eval_avg_sim: 0.6593 video-cosine_similarity: 0.7179 time: 59.9204 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:39:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:39:48 | INFO | __main__ : Step: 5600 +2025-05-12T09:39:48 | INFO | __main__ : Current Frame Index within Batch Video: 63/247 +2025-05-12T09:39:48 | INFO | __main__ : Batch-wise Cosine Similarity | 67.35% +2025-05-12T09:39:48 | INFO | __main__ : Cosine Embedding Loss | 0.3265 +2025-05-12T09:39:48 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:39:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:39:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:40:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:40:11 | INFO | __main__ : Step: 5700 +2025-05-12T09:40:11 | INFO | __main__ : Current Frame Index within Batch Video: 163/247 +2025-05-12T09:40:11 | INFO | __main__ : 
Batch-wise Cosine Similarity | 69.54% +2025-05-12T09:40:11 | INFO | __main__ : Cosine Embedding Loss | 0.3046 +2025-05-12T09:40:11 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:40:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:40:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:40:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 23/4978] eta: 3 days, 11:43:57 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2891 eval_avg_sim: 0.6593 video-cosine_similarity: 0.7109 time: 59.8919 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:40:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:40:35 | INFO | __main__ : Step: 5800 +2025-05-12T09:40:35 | INFO | __main__ : Current Frame Index within Batch Video: 22/247 +2025-05-12T09:40:35 | INFO | __main__ : Batch-wise Cosine Similarity | 63.77% +2025-05-12T09:40:35 | INFO | __main__ : Cosine Embedding Loss | 0.3623 +2025-05-12T09:40:35 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:40:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:40:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:40:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:40:59 | INFO | __main__ : Step: 5900 +2025-05-12T09:40:59 | INFO | __main__ : Current Frame Index within Batch Video: 122/247 +2025-05-12T09:40:59 | INFO | __main__ : Batch-wise Cosine Similarity | 71.27% +2025-05-12T09:40:59 | INFO | __main__ : Cosine Embedding Loss | 0.2873 +2025-05-12T09:40:59 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:40:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:40:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:41:23 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:41:23 | INFO | __main__ : Step: 6000 +2025-05-12T09:41:23 | INFO | __main__ : Current Frame Index within Batch Video: 222/247 +2025-05-12T09:41:23 | INFO | __main__ : Batch-wise Cosine Similarity | 72.69% +2025-05-12T09:41:23 | INFO | __main__ : Cosine Embedding Loss | 0.2731 +2025-05-12T09:41:23 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:41:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:41:23 | INFO | __main__ : Evaluation Average Sim | 0.6593 +2025-05-12T09:41:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:41:23 | INFO | __main__ : Performing periodic evaluation at global step 6000... +2025-05-12T09:41:23 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:41:23 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:41:23 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:41:23 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:41:32 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6380 +2025-05-12T09:41:32 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0006000.png +2025-05-12T09:41:32 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:41:32 | INFO | __main__ : Evaluation at step 6000 complete. 
Average Similarity: 0.6380 +2025-05-12T09:41:38 | INFO | utils.basic_utils : Train Epoch: [0] [ 24/4978] eta: 3 days, 12:02:31 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2688 eval_avg_sim: 0.6380 video-cosine_similarity: 0.7312 time: 59.8141 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:41:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:41:56 | INFO | __main__ : Step: 6100 +2025-05-12T09:41:56 | INFO | __main__ : Current Frame Index within Batch Video: 81/247 +2025-05-12T09:41:56 | INFO | __main__ : Batch-wise Cosine Similarity | 69.27% +2025-05-12T09:41:56 | INFO | __main__ : Cosine Embedding Loss | 0.3073 +2025-05-12T09:41:56 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:41:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:41:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:42:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:42:20 | INFO | __main__ : Step: 6200 +2025-05-12T09:42:20 | INFO | __main__ : Current Frame Index within Batch Video: 181/247 +2025-05-12T09:42:20 | INFO | __main__ : Batch-wise Cosine Similarity | 73.11% +2025-05-12T09:42:20 | INFO | __main__ : Cosine Embedding Loss | 0.2689 +2025-05-12T09:42:20 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:42:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:42:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:42:36 | INFO | utils.basic_utils : Train Epoch: [0] [ 25/4978] eta: 3 days, 11:49:45 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2597 eval_avg_sim: 0.6380 video-cosine_similarity: 0.7403 time: 59.7969 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:42:44 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:42:44 | INFO | __main__ : Step: 6300 +2025-05-12T09:42:44 | INFO | __main__ : Current Frame Index within Batch Video: 40/247 +2025-05-12T09:42:44 | INFO | __main__ : Batch-wise Cosine Similarity | 68.11% +2025-05-12T09:42:44 | INFO | __main__ : Cosine Embedding Loss | 0.3189 +2025-05-12T09:42:44 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:42:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:42:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:43:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:43:08 | INFO | __main__ : Step: 6400 +2025-05-12T09:43:08 | INFO | __main__ : Current Frame Index within Batch Video: 140/247 +2025-05-12T09:43:08 | INFO | __main__ : Batch-wise Cosine Similarity | 71.13% +2025-05-12T09:43:08 | INFO | __main__ : Cosine Embedding Loss | 0.2887 +2025-05-12T09:43:08 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:43:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:43:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:43:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:43:31 | INFO | __main__ : Step: 6500 +2025-05-12T09:43:31 | INFO | __main__ : Current Frame Index within Batch Video: 240/247 +2025-05-12T09:43:31 | INFO | __main__ : Batch-wise Cosine Similarity | 73.40% +2025-05-12T09:43:31 | INFO | __main__ : Cosine Embedding Loss | 0.2660 +2025-05-12T09:43:31 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:43:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:43:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:43:33 | INFO | utils.basic_utils : 
Train Epoch: [0] [ 26/4978] eta: 3 days, 11:38:07 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2647 eval_avg_sim: 0.6380 video-cosine_similarity: 0.7353 time: 59.7898 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:43:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:43:55 | INFO | __main__ : Step: 6600 +2025-05-12T09:43:55 | INFO | __main__ : Current Frame Index within Batch Video: 99/247 +2025-05-12T09:43:55 | INFO | __main__ : Batch-wise Cosine Similarity | 72.49% +2025-05-12T09:43:55 | INFO | __main__ : Cosine Embedding Loss | 0.2751 +2025-05-12T09:43:55 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:43:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:43:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:44:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:44:19 | INFO | __main__ : Step: 6700 +2025-05-12T09:44:19 | INFO | __main__ : Current Frame Index within Batch Video: 199/247 +2025-05-12T09:44:19 | INFO | __main__ : Batch-wise Cosine Similarity | 75.30% +2025-05-12T09:44:19 | INFO | __main__ : Cosine Embedding Loss | 0.2470 +2025-05-12T09:44:19 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:44:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:44:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:44:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 27/4978] eta: 3 days, 11:27:11 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2369 eval_avg_sim: 0.6380 video-cosine_similarity: 0.7631 time: 59.7826 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:44:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:44:43 | INFO | __main__ : Step: 6800 
+2025-05-12T09:44:43 | INFO | __main__ : Current Frame Index within Batch Video: 58/247 +2025-05-12T09:44:43 | INFO | __main__ : Batch-wise Cosine Similarity | 68.40% +2025-05-12T09:44:43 | INFO | __main__ : Cosine Embedding Loss | 0.3160 +2025-05-12T09:44:43 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:44:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:44:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:45:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:45:07 | INFO | __main__ : Step: 6900 +2025-05-12T09:45:07 | INFO | __main__ : Current Frame Index within Batch Video: 158/247 +2025-05-12T09:45:07 | INFO | __main__ : Batch-wise Cosine Similarity | 72.51% +2025-05-12T09:45:07 | INFO | __main__ : Cosine Embedding Loss | 0.2749 +2025-05-12T09:45:07 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:45:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:45:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:45:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 28/4978] eta: 3 days, 11:16:56 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2568 eval_avg_sim: 0.6380 video-cosine_similarity: 0.7432 time: 59.3099 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:45:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:45:31 | INFO | __main__ : Step: 7000 +2025-05-12T09:45:31 | INFO | __main__ : Current Frame Index within Batch Video: 17/247 +2025-05-12T09:45:31 | INFO | __main__ : Batch-wise Cosine Similarity | 64.71% +2025-05-12T09:45:31 | INFO | __main__ : Cosine Embedding Loss | 0.3529 +2025-05-12T09:45:31 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:45:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:45:31 | INFO | 
__main__ : Evaluation Average Sim | 0.6380 +2025-05-12T09:45:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:45:31 | INFO | __main__ : Performing periodic evaluation at global step 7000... +2025-05-12T09:45:31 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:45:31 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:45:31 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:45:31 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:45:41 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6365 +2025-05-12T09:45:41 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0007000.png +2025-05-12T09:45:41 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:45:41 | INFO | __main__ : Evaluation at step 7000 complete. 
Average Similarity: 0.6365 +2025-05-12T09:46:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:46:04 | INFO | __main__ : Step: 7100 +2025-05-12T09:46:04 | INFO | __main__ : Current Frame Index within Batch Video: 117/247 +2025-05-12T09:46:04 | INFO | __main__ : Batch-wise Cosine Similarity | 73.57% +2025-05-12T09:46:04 | INFO | __main__ : Cosine Embedding Loss | 0.2643 +2025-05-12T09:46:04 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:46:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:46:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:46:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:46:28 | INFO | __main__ : Step: 7200 +2025-05-12T09:46:28 | INFO | __main__ : Current Frame Index within Batch Video: 217/247 +2025-05-12T09:46:28 | INFO | __main__ : Batch-wise Cosine Similarity | 77.52% +2025-05-12T09:46:28 | INFO | __main__ : Cosine Embedding Loss | 0.2248 +2025-05-12T09:46:28 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:46:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:46:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:46:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 29/4978] eta: 3 days, 11:34:23 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2335 eval_avg_sim: 0.6365 video-cosine_similarity: 0.7665 time: 59.7991 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:46:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:46:52 | INFO | __main__ : Step: 7300 +2025-05-12T09:46:52 | INFO | __main__ : Current Frame Index within Batch Video: 76/247 +2025-05-12T09:46:52 | INFO | __main__ : Batch-wise Cosine Similarity | 71.38% +2025-05-12T09:46:52 | INFO | 
__main__ : Cosine Embedding Loss | 0.2862 +2025-05-12T09:46:52 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:46:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:46:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:47:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:47:16 | INFO | __main__ : Step: 7400 +2025-05-12T09:47:16 | INFO | __main__ : Current Frame Index within Batch Video: 176/247 +2025-05-12T09:47:16 | INFO | __main__ : Batch-wise Cosine Similarity | 73.94% +2025-05-12T09:47:16 | INFO | __main__ : Cosine Embedding Loss | 0.2606 +2025-05-12T09:47:16 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:47:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:47:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:47:33 | INFO | utils.basic_utils : Train Epoch: [0] [ 30/4978] eta: 3 days, 11:24:23 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2400 eval_avg_sim: 0.6365 video-cosine_similarity: 0.7600 time: 59.7974 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:47:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:47:40 | INFO | __main__ : Step: 7500 +2025-05-12T09:47:40 | INFO | __main__ : Current Frame Index within Batch Video: 35/247 +2025-05-12T09:47:40 | INFO | __main__ : Batch-wise Cosine Similarity | 68.32% +2025-05-12T09:47:40 | INFO | __main__ : Cosine Embedding Loss | 0.3168 +2025-05-12T09:47:40 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:47:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:47:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:48:03 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:48:03 | INFO | __main__ : Step: 7600 +2025-05-12T09:48:03 | INFO | __main__ : Current Frame Index within Batch Video: 135/247 +2025-05-12T09:48:03 | INFO | __main__ : Batch-wise Cosine Similarity | 71.35% +2025-05-12T09:48:03 | INFO | __main__ : Cosine Embedding Loss | 0.2865 +2025-05-12T09:48:03 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:48:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:48:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:48:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:48:27 | INFO | __main__ : Step: 7700 +2025-05-12T09:48:27 | INFO | __main__ : Current Frame Index within Batch Video: 235/247 +2025-05-12T09:48:27 | INFO | __main__ : Batch-wise Cosine Similarity | 73.38% +2025-05-12T09:48:27 | INFO | __main__ : Cosine Embedding Loss | 0.2662 +2025-05-12T09:48:27 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:48:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:48:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:48:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 31/4978] eta: 3 days, 11:15:04 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2787 eval_avg_sim: 0.6365 video-cosine_similarity: 0.7213 time: 59.7940 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:48:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:48:51 | INFO | __main__ : Step: 7800 +2025-05-12T09:48:51 | INFO | __main__ : Current Frame Index within Batch Video: 94/247 +2025-05-12T09:48:51 | INFO | __main__ : Batch-wise Cosine Similarity | 69.22% +2025-05-12T09:48:51 | INFO | __main__ : Cosine Embedding Loss | 0.3078 +2025-05-12T09:48:51 | INFO | 
__main__ : Learning Rate | 0.000001 +2025-05-12T09:48:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:48:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:49:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:49:15 | INFO | __main__ : Step: 7900 +2025-05-12T09:49:15 | INFO | __main__ : Current Frame Index within Batch Video: 194/247 +2025-05-12T09:49:15 | INFO | __main__ : Batch-wise Cosine Similarity | 71.73% +2025-05-12T09:49:15 | INFO | __main__ : Cosine Embedding Loss | 0.2827 +2025-05-12T09:49:15 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:49:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:49:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:49:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 32/4978] eta: 3 days, 11:06:04 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2616 eval_avg_sim: 0.6365 video-cosine_similarity: 0.7384 time: 59.3267 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:49:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:49:39 | INFO | __main__ : Step: 8000 +2025-05-12T09:49:39 | INFO | __main__ : Current Frame Index within Batch Video: 53/247 +2025-05-12T09:49:39 | INFO | __main__ : Batch-wise Cosine Similarity | 71.03% +2025-05-12T09:49:39 | INFO | __main__ : Cosine Embedding Loss | 0.2897 +2025-05-12T09:49:39 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:49:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:49:39 | INFO | __main__ : Evaluation Average Sim | 0.6365 +2025-05-12T09:49:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:49:39 | INFO | __main__ : Performing periodic evaluation at global step 8000... 
+2025-05-12T09:49:39 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:49:39 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:49:39 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:49:39 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:49:49 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6630 +2025-05-12T09:49:49 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0008000.png +2025-05-12T09:49:49 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:49:49 | INFO | __main__ : Evaluation at step 8000 complete. Average Similarity: 0.6630 +2025-05-12T09:50:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:50:12 | INFO | __main__ : Step: 8100 +2025-05-12T09:50:12 | INFO | __main__ : Current Frame Index within Batch Video: 153/247 +2025-05-12T09:50:12 | INFO | __main__ : Batch-wise Cosine Similarity | 74.56% +2025-05-12T09:50:12 | INFO | __main__ : Cosine Embedding Loss | 0.2544 +2025-05-12T09:50:12 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:50:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:50:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:50:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 33/4978] eta: 3 days, 11:21:06 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2412 eval_avg_sim: 0.6630 video-cosine_similarity: 0.7588 time: 59.8087 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:50:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:50:36 | INFO | __main__ : 
Step: 8200 +2025-05-12T09:50:36 | INFO | __main__ : Current Frame Index within Batch Video: 12/247 +2025-05-12T09:50:36 | INFO | __main__ : Batch-wise Cosine Similarity | 61.53% +2025-05-12T09:50:36 | INFO | __main__ : Cosine Embedding Loss | 0.3847 +2025-05-12T09:50:36 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:50:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:50:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:51:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:51:00 | INFO | __main__ : Step: 8300 +2025-05-12T09:51:00 | INFO | __main__ : Current Frame Index within Batch Video: 112/247 +2025-05-12T09:51:00 | INFO | __main__ : Batch-wise Cosine Similarity | 73.49% +2025-05-12T09:51:00 | INFO | __main__ : Cosine Embedding Loss | 0.2651 +2025-05-12T09:51:00 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:51:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:51:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:51:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:51:24 | INFO | __main__ : Step: 8400 +2025-05-12T09:51:24 | INFO | __main__ : Current Frame Index within Batch Video: 212/247 +2025-05-12T09:51:24 | INFO | __main__ : Batch-wise Cosine Similarity | 76.53% +2025-05-12T09:51:24 | INFO | __main__ : Cosine Embedding Loss | 0.2347 +2025-05-12T09:51:24 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:51:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:51:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:51:32 | INFO | utils.basic_utils : Train Epoch: [0] [ 34/4978] eta: 3 days, 11:12:20 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2309 
eval_avg_sim: 0.6630 video-cosine_similarity: 0.7691 time: 59.8084 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:51:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:51:47 | INFO | __main__ : Step: 8500 +2025-05-12T09:51:47 | INFO | __main__ : Current Frame Index within Batch Video: 71/247 +2025-05-12T09:51:47 | INFO | __main__ : Batch-wise Cosine Similarity | 73.62% +2025-05-12T09:51:47 | INFO | __main__ : Cosine Embedding Loss | 0.2638 +2025-05-12T09:51:47 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:51:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:51:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:52:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:52:11 | INFO | __main__ : Step: 8600 +2025-05-12T09:52:11 | INFO | __main__ : Current Frame Index within Batch Video: 171/247 +2025-05-12T09:52:11 | INFO | __main__ : Batch-wise Cosine Similarity | 76.45% +2025-05-12T09:52:11 | INFO | __main__ : Cosine Embedding Loss | 0.2355 +2025-05-12T09:52:11 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:52:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:52:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:52:29 | INFO | utils.basic_utils : Train Epoch: [0] [ 35/4978] eta: 3 days, 11:03:56 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2234 eval_avg_sim: 0.6630 video-cosine_similarity: 0.7766 time: 59.8048 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:52:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:52:35 | INFO | __main__ : Step: 8700 +2025-05-12T09:52:35 | INFO | __main__ : Current Frame Index within Batch Video: 30/247 +2025-05-12T09:52:35 | INFO | 
__main__ : Batch-wise Cosine Similarity | 67.69% +2025-05-12T09:52:35 | INFO | __main__ : Cosine Embedding Loss | 0.3231 +2025-05-12T09:52:35 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:52:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:52:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:52:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:52:59 | INFO | __main__ : Step: 8800 +2025-05-12T09:52:59 | INFO | __main__ : Current Frame Index within Batch Video: 130/247 +2025-05-12T09:52:59 | INFO | __main__ : Batch-wise Cosine Similarity | 74.77% +2025-05-12T09:52:59 | INFO | __main__ : Cosine Embedding Loss | 0.2523 +2025-05-12T09:52:59 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:52:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:52:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:53:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:53:23 | INFO | __main__ : Step: 8900 +2025-05-12T09:53:23 | INFO | __main__ : Current Frame Index within Batch Video: 230/247 +2025-05-12T09:53:23 | INFO | __main__ : Batch-wise Cosine Similarity | 76.75% +2025-05-12T09:53:23 | INFO | __main__ : Cosine Embedding Loss | 0.2325 +2025-05-12T09:53:23 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:53:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:53:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:53:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 36/4978] eta: 3 days, 10:56:07 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2388 eval_avg_sim: 0.6630 video-cosine_similarity: 0.7612 time: 59.3391 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T09:53:47 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:53:47 | INFO | __main__ : Step: 9000 +2025-05-12T09:53:47 | INFO | __main__ : Current Frame Index within Batch Video: 89/247 +2025-05-12T09:53:47 | INFO | __main__ : Batch-wise Cosine Similarity | 73.20% +2025-05-12T09:53:47 | INFO | __main__ : Cosine Embedding Loss | 0.2680 +2025-05-12T09:53:47 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:53:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:53:47 | INFO | __main__ : Evaluation Average Sim | 0.6630 +2025-05-12T09:53:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:53:47 | INFO | __main__ : Performing periodic evaluation at global step 9000... +2025-05-12T09:53:47 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:53:47 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:53:47 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:53:47 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:53:56 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6050 +2025-05-12T09:53:56 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0009000.png +2025-05-12T09:53:56 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:53:56 | INFO | __main__ : Evaluation at step 9000 complete. 
Average Similarity: 0.6050 +2025-05-12T09:54:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:54:20 | INFO | __main__ : Step: 9100 +2025-05-12T09:54:20 | INFO | __main__ : Current Frame Index within Batch Video: 189/247 +2025-05-12T09:54:20 | INFO | __main__ : Batch-wise Cosine Similarity | 75.71% +2025-05-12T09:54:20 | INFO | __main__ : Cosine Embedding Loss | 0.2429 +2025-05-12T09:54:20 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:54:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:54:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:54:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 37/4978] eta: 3 days, 11:09:09 lr: 0.000001 temperature: 0.0126 video-loss_cosine: 0.2188 eval_avg_sim: 0.6050 video-cosine_similarity: 0.7812 time: 59.8113 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:54:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:54:44 | INFO | __main__ : Step: 9200 +2025-05-12T09:54:44 | INFO | __main__ : Current Frame Index within Batch Video: 48/247 +2025-05-12T09:54:44 | INFO | __main__ : Batch-wise Cosine Similarity | 66.69% +2025-05-12T09:54:44 | INFO | __main__ : Cosine Embedding Loss | 0.3331 +2025-05-12T09:54:44 | INFO | __main__ : Learning Rate | 0.000001 +2025-05-12T09:54:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:54:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:55:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:55:07 | INFO | __main__ : Step: 9300 +2025-05-12T09:55:07 | INFO | __main__ : Current Frame Index within Batch Video: 148/247 +2025-05-12T09:55:07 | INFO | __main__ : Batch-wise Cosine Similarity | 71.24% +2025-05-12T09:55:07 | INFO | 
__main__ : Cosine Embedding Loss | 0.2876 +2025-05-12T09:55:07 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:55:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:55:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:55:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 38/4978] eta: 3 days, 11:01:23 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2584 eval_avg_sim: 0.6050 video-cosine_similarity: 0.7416 time: 59.8120 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:55:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:55:31 | INFO | __main__ : Step: 9400 +2025-05-12T09:55:31 | INFO | __main__ : Current Frame Index within Batch Video: 7/247 +2025-05-12T09:55:31 | INFO | __main__ : Batch-wise Cosine Similarity | 64.59% +2025-05-12T09:55:31 | INFO | __main__ : Cosine Embedding Loss | 0.3541 +2025-05-12T09:55:31 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:55:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:55:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:55:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:55:55 | INFO | __main__ : Step: 9500 +2025-05-12T09:55:55 | INFO | __main__ : Current Frame Index within Batch Video: 107/247 +2025-05-12T09:55:55 | INFO | __main__ : Batch-wise Cosine Similarity | 76.07% +2025-05-12T09:55:55 | INFO | __main__ : Cosine Embedding Loss | 0.2393 +2025-05-12T09:55:55 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:55:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:55:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:56:19 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:56:19 | INFO | __main__ : Step: 9600 +2025-05-12T09:56:19 | INFO | __main__ : Current Frame Index within Batch Video: 207/247 +2025-05-12T09:56:19 | INFO | __main__ : Batch-wise Cosine Similarity | 78.00% +2025-05-12T09:56:19 | INFO | __main__ : Cosine Embedding Loss | 0.2200 +2025-05-12T09:56:19 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:56:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:56:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:56:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 39/4978] eta: 3 days, 10:54:03 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2158 eval_avg_sim: 0.6050 video-cosine_similarity: 0.7842 time: 59.8096 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:56:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:56:43 | INFO | __main__ : Step: 9700 +2025-05-12T09:56:43 | INFO | __main__ : Current Frame Index within Batch Video: 66/247 +2025-05-12T09:56:43 | INFO | __main__ : Batch-wise Cosine Similarity | 68.98% +2025-05-12T09:56:43 | INFO | __main__ : Cosine Embedding Loss | 0.3102 +2025-05-12T09:56:43 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:56:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:56:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:07 | INFO | __main__ : Step: 9800 +2025-05-12T09:57:07 | INFO | __main__ : Current Frame Index within Batch Video: 166/247 +2025-05-12T09:57:07 | INFO | __main__ : Batch-wise Cosine Similarity | 73.05% +2025-05-12T09:57:07 | INFO | __main__ : Cosine Embedding Loss | 0.2695 +2025-05-12T09:57:07 | INFO | 
__main__ : Learning Rate | 0.000002 +2025-05-12T09:57:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:57:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 40/4978] eta: 3 days, 10:47:00 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2244 eval_avg_sim: 0.6050 video-cosine_similarity: 0.7756 time: 59.3369 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:57:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:30 | INFO | __main__ : Step: 9900 +2025-05-12T09:57:30 | INFO | __main__ : Current Frame Index within Batch Video: 25/247 +2025-05-12T09:57:30 | INFO | __main__ : Batch-wise Cosine Similarity | 63.70% +2025-05-12T09:57:30 | INFO | __main__ : Cosine Embedding Loss | 0.3630 +2025-05-12T09:57:30 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:57:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:57:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:54 | INFO | __main__ : Step: 10000 +2025-05-12T09:57:54 | INFO | __main__ : Current Frame Index within Batch Video: 125/247 +2025-05-12T09:57:54 | INFO | __main__ : Batch-wise Cosine Similarity | 74.43% +2025-05-12T09:57:54 | INFO | __main__ : Cosine Embedding Loss | 0.2557 +2025-05-12T09:57:54 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:57:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:57:54 | INFO | __main__ : Evaluation Average Sim | 0.6050 +2025-05-12T09:57:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:57:54 | INFO | __main__ : Saving checkpoint at global step 10000 +2025-05-12T09:57:55 | INFO | 
__main__ : Performing periodic evaluation at global step 10000... +2025-05-12T09:57:55 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T09:57:55 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T09:57:55 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T09:57:55 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T09:58:04 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6093 +2025-05-12T09:58:04 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0010000.png +2025-05-12T09:58:04 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T09:58:04 | INFO | __main__ : Evaluation at step 10000 complete. Average Similarity: 0.6093 +2025-05-12T09:58:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:58:28 | INFO | __main__ : Step: 10100 +2025-05-12T09:58:28 | INFO | __main__ : Current Frame Index within Batch Video: 225/247 +2025-05-12T09:58:28 | INFO | __main__ : Batch-wise Cosine Similarity | 80.64% +2025-05-12T09:58:28 | INFO | __main__ : Cosine Embedding Loss | 0.1936 +2025-05-12T09:58:28 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:58:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:58:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:58:33 | INFO | utils.basic_utils : Train Epoch: [0] [ 41/4978] eta: 3 days, 10:58:55 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1843 eval_avg_sim: 0.6093 video-cosine_similarity: 0.8157 time: 59.8130 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:58:51 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:58:52 | INFO | __main__ : Step: 10200 +2025-05-12T09:58:52 | INFO | __main__ : Current Frame Index within Batch Video: 84/247 +2025-05-12T09:58:52 | INFO | __main__ : Batch-wise Cosine Similarity | 71.52% +2025-05-12T09:58:52 | INFO | __main__ : Cosine Embedding Loss | 0.2848 +2025-05-12T09:58:52 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:58:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:58:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:59:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:59:15 | INFO | __main__ : Step: 10300 +2025-05-12T09:59:15 | INFO | __main__ : Current Frame Index within Batch Video: 184/247 +2025-05-12T09:59:15 | INFO | __main__ : Batch-wise Cosine Similarity | 76.67% +2025-05-12T09:59:15 | INFO | __main__ : Cosine Embedding Loss | 0.2333 +2025-05-12T09:59:15 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T09:59:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:59:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:59:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 42/4978] eta: 3 days, 10:51:57 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2317 eval_avg_sim: 0.6093 video-cosine_similarity: 0.7683 time: 59.8107 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T09:59:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T09:59:39 | INFO | __main__ : Step: 10400 +2025-05-12T09:59:39 | INFO | __main__ : Current Frame Index within Batch Video: 43/247 +2025-05-12T09:59:39 | INFO | __main__ : Batch-wise Cosine Similarity | 70.36% +2025-05-12T09:59:39 | INFO | __main__ : Cosine Embedding Loss | 0.2964 +2025-05-12T09:59:39 | INFO 
| __main__ : Learning Rate | 0.000002 +2025-05-12T09:59:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T09:59:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:00:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:00:03 | INFO | __main__ : Step: 10500 +2025-05-12T10:00:03 | INFO | __main__ : Current Frame Index within Batch Video: 143/247 +2025-05-12T10:00:03 | INFO | __main__ : Batch-wise Cosine Similarity | 76.05% +2025-05-12T10:00:03 | INFO | __main__ : Cosine Embedding Loss | 0.2395 +2025-05-12T10:00:03 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:00:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:00:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:00:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:00:27 | INFO | __main__ : Step: 10600 +2025-05-12T10:00:27 | INFO | __main__ : Current Frame Index within Batch Video: 243/247 +2025-05-12T10:00:27 | INFO | __main__ : Batch-wise Cosine Similarity | 78.50% +2025-05-12T10:00:27 | INFO | __main__ : Cosine Embedding Loss | 0.2150 +2025-05-12T10:00:27 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:00:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:00:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:00:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 43/4978] eta: 3 days, 10:45:17 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2127 eval_avg_sim: 0.6093 video-cosine_similarity: 0.7873 time: 59.8083 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:00:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:00:51 | INFO | __main__ : Step: 10700 
+2025-05-12T10:00:51 | INFO | __main__ : Current Frame Index within Batch Video: 102/247 +2025-05-12T10:00:51 | INFO | __main__ : Batch-wise Cosine Similarity | 74.04% +2025-05-12T10:00:51 | INFO | __main__ : Cosine Embedding Loss | 0.2596 +2025-05-12T10:00:51 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:00:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:00:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:01:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:01:14 | INFO | __main__ : Step: 10800 +2025-05-12T10:01:14 | INFO | __main__ : Current Frame Index within Batch Video: 202/247 +2025-05-12T10:01:14 | INFO | __main__ : Batch-wise Cosine Similarity | 76.85% +2025-05-12T10:01:14 | INFO | __main__ : Cosine Embedding Loss | 0.2315 +2025-05-12T10:01:14 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:01:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:01:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:01:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 44/4978] eta: 3 days, 10:38:51 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2220 eval_avg_sim: 0.6093 video-cosine_similarity: 0.7780 time: 59.3400 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:01:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:01:38 | INFO | __main__ : Step: 10900 +2025-05-12T10:01:38 | INFO | __main__ : Current Frame Index within Batch Video: 61/247 +2025-05-12T10:01:38 | INFO | __main__ : Batch-wise Cosine Similarity | 72.23% +2025-05-12T10:01:38 | INFO | __main__ : Cosine Embedding Loss | 0.2777 +2025-05-12T10:01:38 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:01:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:01:38 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:02:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:02:02 | INFO | __main__ : Step: 11000 +2025-05-12T10:02:02 | INFO | __main__ : Current Frame Index within Batch Video: 161/247 +2025-05-12T10:02:02 | INFO | __main__ : Batch-wise Cosine Similarity | 77.57% +2025-05-12T10:02:02 | INFO | __main__ : Cosine Embedding Loss | 0.2243 +2025-05-12T10:02:02 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:02:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:02:02 | INFO | __main__ : Evaluation Average Sim | 0.6093 +2025-05-12T10:02:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:02:02 | INFO | __main__ : Performing periodic evaluation at global step 11000... +2025-05-12T10:02:02 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:02:02 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:02:02 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:02:02 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:02:12 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6764 +2025-05-12T10:02:12 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0011000.png +2025-05-12T10:02:12 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:02:12 | INFO | __main__ : Evaluation at step 11000 complete. 
Average Similarity: 0.6764 +2025-05-12T10:02:32 | INFO | utils.basic_utils : Train Epoch: [0] [ 45/4978] eta: 3 days, 10:49:23 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1917 eval_avg_sim: 0.6764 video-cosine_similarity: 0.8083 time: 59.8094 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:02:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:02:35 | INFO | __main__ : Step: 11100 +2025-05-12T10:02:35 | INFO | __main__ : Current Frame Index within Batch Video: 20/247 +2025-05-12T10:02:35 | INFO | __main__ : Batch-wise Cosine Similarity | 62.58% +2025-05-12T10:02:35 | INFO | __main__ : Cosine Embedding Loss | 0.3742 +2025-05-12T10:02:35 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:02:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:02:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:02:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:02:59 | INFO | __main__ : Step: 11200 +2025-05-12T10:02:59 | INFO | __main__ : Current Frame Index within Batch Video: 120/247 +2025-05-12T10:02:59 | INFO | __main__ : Batch-wise Cosine Similarity | 74.95% +2025-05-12T10:02:59 | INFO | __main__ : Cosine Embedding Loss | 0.2505 +2025-05-12T10:02:59 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:02:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:02:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:03:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:03:23 | INFO | __main__ : Step: 11300 +2025-05-12T10:03:23 | INFO | __main__ : Current Frame Index within Batch Video: 220/247 +2025-05-12T10:03:23 | INFO | __main__ : Batch-wise Cosine Similarity | 79.35% +2025-05-12T10:03:23 | INFO | 
__main__ : Cosine Embedding Loss | 0.2065 +2025-05-12T10:03:23 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:03:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:03:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:03:29 | INFO | utils.basic_utils : Train Epoch: [0] [ 46/4978] eta: 3 days, 10:43:04 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1898 eval_avg_sim: 0.6764 video-cosine_similarity: 0.8102 time: 59.8067 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:03:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:03:47 | INFO | __main__ : Step: 11400 +2025-05-12T10:03:47 | INFO | __main__ : Current Frame Index within Batch Video: 79/247 +2025-05-12T10:03:47 | INFO | __main__ : Batch-wise Cosine Similarity | 73.77% +2025-05-12T10:03:47 | INFO | __main__ : Cosine Embedding Loss | 0.2623 +2025-05-12T10:03:47 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:03:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:03:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:04:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:04:10 | INFO | __main__ : Step: 11500 +2025-05-12T10:04:10 | INFO | __main__ : Current Frame Index within Batch Video: 179/247 +2025-05-12T10:04:10 | INFO | __main__ : Batch-wise Cosine Similarity | 79.87% +2025-05-12T10:04:10 | INFO | __main__ : Cosine Embedding Loss | 0.2013 +2025-05-12T10:04:10 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:04:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:04:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:04:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 47/4978] eta: 3 days, 10:37:02 lr: 
0.000002 temperature: 0.0126 video-loss_cosine: 0.1979 eval_avg_sim: 0.6764 video-cosine_similarity: 0.8021 time: 59.8074 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:04:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:04:34 | INFO | __main__ : Step: 11600 +2025-05-12T10:04:34 | INFO | __main__ : Current Frame Index within Batch Video: 38/247 +2025-05-12T10:04:34 | INFO | __main__ : Batch-wise Cosine Similarity | 67.65% +2025-05-12T10:04:34 | INFO | __main__ : Cosine Embedding Loss | 0.3235 +2025-05-12T10:04:34 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:04:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:04:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:04:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:04:58 | INFO | __main__ : Step: 11700 +2025-05-12T10:04:58 | INFO | __main__ : Current Frame Index within Batch Video: 138/247 +2025-05-12T10:04:58 | INFO | __main__ : Batch-wise Cosine Similarity | 75.79% +2025-05-12T10:04:58 | INFO | __main__ : Cosine Embedding Loss | 0.2421 +2025-05-12T10:04:58 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:04:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:04:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:05:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:05:22 | INFO | __main__ : Step: 11800 +2025-05-12T10:05:22 | INFO | __main__ : Current Frame Index within Batch Video: 238/247 +2025-05-12T10:05:22 | INFO | __main__ : Batch-wise Cosine Similarity | 79.80% +2025-05-12T10:05:22 | INFO | __main__ : Cosine Embedding Loss | 0.2020 +2025-05-12T10:05:22 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:05:22 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:05:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:05:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 48/4978] eta: 3 days, 10:31:14 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1986 eval_avg_sim: 0.6764 video-cosine_similarity: 0.8014 time: 59.8085 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:05:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:05:46 | INFO | __main__ : Step: 11900 +2025-05-12T10:05:46 | INFO | __main__ : Current Frame Index within Batch Video: 97/247 +2025-05-12T10:05:46 | INFO | __main__ : Batch-wise Cosine Similarity | 73.59% +2025-05-12T10:05:46 | INFO | __main__ : Cosine Embedding Loss | 0.2641 +2025-05-12T10:05:46 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:05:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:05:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:06:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:06:10 | INFO | __main__ : Step: 12000 +2025-05-12T10:06:10 | INFO | __main__ : Current Frame Index within Batch Video: 197/247 +2025-05-12T10:06:10 | INFO | __main__ : Batch-wise Cosine Similarity | 77.66% +2025-05-12T10:06:10 | INFO | __main__ : Cosine Embedding Loss | 0.2234 +2025-05-12T10:06:10 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:06:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:06:10 | INFO | __main__ : Evaluation Average Sim | 0.6764 +2025-05-12T10:06:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:06:10 | INFO | __main__ : Performing periodic evaluation at global step 12000... 
+2025-05-12T10:06:10 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:06:10 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:06:10 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:06:10 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:06:19 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6579 +2025-05-12T10:06:19 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0012000.png +2025-05-12T10:06:19 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:06:19 | INFO | __main__ : Evaluation at step 12000 complete. Average Similarity: 0.6579 +2025-05-12T10:06:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 49/4978] eta: 3 days, 10:41:05 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2112 eval_avg_sim: 0.6579 video-cosine_similarity: 0.7888 time: 59.7872 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:06:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:06:43 | INFO | __main__ : Step: 12100 +2025-05-12T10:06:43 | INFO | __main__ : Current Frame Index within Batch Video: 56/247 +2025-05-12T10:06:43 | INFO | __main__ : Batch-wise Cosine Similarity | 72.30% +2025-05-12T10:06:43 | INFO | __main__ : Cosine Embedding Loss | 0.2770 +2025-05-12T10:06:43 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:06:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:06:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:07:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:07:07 | INFO | __main__ : 
Step: 12200 +2025-05-12T10:07:07 | INFO | __main__ : Current Frame Index within Batch Video: 156/247 +2025-05-12T10:07:07 | INFO | __main__ : Batch-wise Cosine Similarity | 77.73% +2025-05-12T10:07:07 | INFO | __main__ : Cosine Embedding Loss | 0.2227 +2025-05-12T10:07:07 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:07:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:07:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:07:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 50/4978] eta: 3 days, 10:35:11 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1755 eval_avg_sim: 0.6579 video-cosine_similarity: 0.8245 time: 59.7845 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:07:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:07:31 | INFO | __main__ : Step: 12300 +2025-05-12T10:07:31 | INFO | __main__ : Current Frame Index within Batch Video: 15/247 +2025-05-12T10:07:31 | INFO | __main__ : Batch-wise Cosine Similarity | 62.90% +2025-05-12T10:07:31 | INFO | __main__ : Cosine Embedding Loss | 0.3710 +2025-05-12T10:07:31 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:07:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:07:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:07:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:07:54 | INFO | __main__ : Step: 12400 +2025-05-12T10:07:54 | INFO | __main__ : Current Frame Index within Batch Video: 115/247 +2025-05-12T10:07:54 | INFO | __main__ : Batch-wise Cosine Similarity | 75.68% +2025-05-12T10:07:54 | INFO | __main__ : Cosine Embedding Loss | 0.2432 +2025-05-12T10:07:54 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:07:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:07:54 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:08:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:08:18 | INFO | __main__ : Step: 12500 +2025-05-12T10:08:18 | INFO | __main__ : Current Frame Index within Batch Video: 215/247 +2025-05-12T10:08:18 | INFO | __main__ : Batch-wise Cosine Similarity | 80.12% +2025-05-12T10:08:18 | INFO | __main__ : Cosine Embedding Loss | 0.1988 +2025-05-12T10:08:18 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:08:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:08:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:08:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 51/4978] eta: 3 days, 10:29:38 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1795 eval_avg_sim: 0.6579 video-cosine_similarity: 0.8205 time: 59.7839 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:08:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:08:42 | INFO | __main__ : Step: 12600 +2025-05-12T10:08:42 | INFO | __main__ : Current Frame Index within Batch Video: 74/247 +2025-05-12T10:08:42 | INFO | __main__ : Batch-wise Cosine Similarity | 73.90% +2025-05-12T10:08:42 | INFO | __main__ : Cosine Embedding Loss | 0.2610 +2025-05-12T10:08:42 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:08:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:08:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:09:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:09:06 | INFO | __main__ : Step: 12700 +2025-05-12T10:09:06 | INFO | __main__ : Current Frame Index within Batch Video: 174/247 +2025-05-12T10:09:06 | INFO | 
__main__ : Batch-wise Cosine Similarity | 80.93% +2025-05-12T10:09:06 | INFO | __main__ : Cosine Embedding Loss | 0.1907 +2025-05-12T10:09:06 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:09:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:09:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:09:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 52/4978] eta: 3 days, 10:24:11 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1574 eval_avg_sim: 0.6579 video-cosine_similarity: 0.8426 time: 59.7852 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:09:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:09:30 | INFO | __main__ : Step: 12800 +2025-05-12T10:09:30 | INFO | __main__ : Current Frame Index within Batch Video: 33/247 +2025-05-12T10:09:30 | INFO | __main__ : Batch-wise Cosine Similarity | 67.31% +2025-05-12T10:09:30 | INFO | __main__ : Cosine Embedding Loss | 0.3269 +2025-05-12T10:09:30 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:09:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:09:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:09:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:09:54 | INFO | __main__ : Step: 12900 +2025-05-12T10:09:54 | INFO | __main__ : Current Frame Index within Batch Video: 133/247 +2025-05-12T10:09:54 | INFO | __main__ : Batch-wise Cosine Similarity | 75.90% +2025-05-12T10:09:54 | INFO | __main__ : Cosine Embedding Loss | 0.2410 +2025-05-12T10:09:54 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:09:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:09:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:10:17 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:10:17 | INFO | __main__ : Step: 13000 +2025-05-12T10:10:17 | INFO | __main__ : Current Frame Index within Batch Video: 233/247 +2025-05-12T10:10:17 | INFO | __main__ : Batch-wise Cosine Similarity | 79.11% +2025-05-12T10:10:17 | INFO | __main__ : Cosine Embedding Loss | 0.2089 +2025-05-12T10:10:17 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:10:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:10:17 | INFO | __main__ : Evaluation Average Sim | 0.6579 +2025-05-12T10:10:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:10:18 | INFO | __main__ : Performing periodic evaluation at global step 13000... +2025-05-12T10:10:18 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:10:18 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:10:18 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:10:18 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:10:27 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6762 +2025-05-12T10:10:27 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0013000.png +2025-05-12T10:10:27 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:10:27 | INFO | __main__ : Evaluation at step 13000 complete. 
Average Similarity: 0.6762 +2025-05-12T10:10:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 53/4978] eta: 3 days, 10:33:08 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1966 eval_avg_sim: 0.6762 video-cosine_similarity: 0.8034 time: 59.7683 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:10:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:10:51 | INFO | __main__ : Step: 13100 +2025-05-12T10:10:51 | INFO | __main__ : Current Frame Index within Batch Video: 92/247 +2025-05-12T10:10:51 | INFO | __main__ : Batch-wise Cosine Similarity | 77.82% +2025-05-12T10:10:51 | INFO | __main__ : Cosine Embedding Loss | 0.2218 +2025-05-12T10:10:51 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:10:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:10:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:11:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:11:14 | INFO | __main__ : Step: 13200 +2025-05-12T10:11:14 | INFO | __main__ : Current Frame Index within Batch Video: 192/247 +2025-05-12T10:11:14 | INFO | __main__ : Batch-wise Cosine Similarity | 83.78% +2025-05-12T10:11:14 | INFO | __main__ : Cosine Embedding Loss | 0.1622 +2025-05-12T10:11:14 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:11:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:11:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:11:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 54/4978] eta: 3 days, 10:27:40 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1547 eval_avg_sim: 0.6762 video-cosine_similarity: 0.8453 time: 59.7665 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:11:38 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:11:38 | INFO | __main__ : Step: 13300 +2025-05-12T10:11:38 | INFO | __main__ : Current Frame Index within Batch Video: 51/247 +2025-05-12T10:11:38 | INFO | __main__ : Batch-wise Cosine Similarity | 73.17% +2025-05-12T10:11:38 | INFO | __main__ : Cosine Embedding Loss | 0.2683 +2025-05-12T10:11:38 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:11:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:11:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:12:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:12:02 | INFO | __main__ : Step: 13400 +2025-05-12T10:12:02 | INFO | __main__ : Current Frame Index within Batch Video: 151/247 +2025-05-12T10:12:02 | INFO | __main__ : Batch-wise Cosine Similarity | 78.83% +2025-05-12T10:12:02 | INFO | __main__ : Cosine Embedding Loss | 0.2117 +2025-05-12T10:12:02 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:12:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:12:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:12:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 55/4978] eta: 3 days, 10:22:28 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1790 eval_avg_sim: 0.6762 video-cosine_similarity: 0.8210 time: 59.7701 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:12:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:12:26 | INFO | __main__ : Step: 13500 +2025-05-12T10:12:26 | INFO | __main__ : Current Frame Index within Batch Video: 10/247 +2025-05-12T10:12:26 | INFO | __main__ : Batch-wise Cosine Similarity | 56.96% +2025-05-12T10:12:26 | INFO | __main__ : Cosine Embedding Loss | 0.4304 +2025-05-12T10:12:26 | INFO 
| __main__ : Learning Rate | 0.000002 +2025-05-12T10:12:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:12:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:12:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:12:50 | INFO | __main__ : Step: 13600 +2025-05-12T10:12:50 | INFO | __main__ : Current Frame Index within Batch Video: 110/247 +2025-05-12T10:12:50 | INFO | __main__ : Batch-wise Cosine Similarity | 76.70% +2025-05-12T10:12:50 | INFO | __main__ : Cosine Embedding Loss | 0.2330 +2025-05-12T10:12:50 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:12:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:12:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:13:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:13:13 | INFO | __main__ : Step: 13700 +2025-05-12T10:13:13 | INFO | __main__ : Current Frame Index within Batch Video: 210/247 +2025-05-12T10:13:13 | INFO | __main__ : Batch-wise Cosine Similarity | 81.55% +2025-05-12T10:13:13 | INFO | __main__ : Cosine Embedding Loss | 0.1845 +2025-05-12T10:13:13 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:13:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:13:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:13:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 56/4978] eta: 3 days, 10:17:26 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1650 eval_avg_sim: 0.6762 video-cosine_similarity: 0.8350 time: 59.7699 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:13:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:13:37 | INFO | __main__ : Step: 13800 
+2025-05-12T10:13:37 | INFO | __main__ : Current Frame Index within Batch Video: 69/247 +2025-05-12T10:13:37 | INFO | __main__ : Batch-wise Cosine Similarity | 73.57% +2025-05-12T10:13:37 | INFO | __main__ : Cosine Embedding Loss | 0.2643 +2025-05-12T10:13:37 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:13:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:13:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:14:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:14:01 | INFO | __main__ : Step: 13900 +2025-05-12T10:14:01 | INFO | __main__ : Current Frame Index within Batch Video: 169/247 +2025-05-12T10:14:01 | INFO | __main__ : Batch-wise Cosine Similarity | 78.24% +2025-05-12T10:14:01 | INFO | __main__ : Cosine Embedding Loss | 0.2176 +2025-05-12T10:14:01 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:14:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:14:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:14:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 57/4978] eta: 3 days, 10:12:32 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2050 eval_avg_sim: 0.6762 video-cosine_similarity: 0.7950 time: 59.2971 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:14:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:14:25 | INFO | __main__ : Step: 14000 +2025-05-12T10:14:25 | INFO | __main__ : Current Frame Index within Batch Video: 28/247 +2025-05-12T10:14:25 | INFO | __main__ : Batch-wise Cosine Similarity | 72.33% +2025-05-12T10:14:25 | INFO | __main__ : Cosine Embedding Loss | 0.2767 +2025-05-12T10:14:25 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:14:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:14:25 | INFO | 
__main__ : Evaluation Average Sim | 0.6762 +2025-05-12T10:14:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:14:25 | INFO | __main__ : Performing periodic evaluation at global step 14000... +2025-05-12T10:14:25 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:14:25 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:14:25 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:14:25 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:14:35 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6990 +2025-05-12T10:14:35 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0014000.png +2025-05-12T10:14:35 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:14:35 | INFO | __main__ : Evaluation at step 14000 complete. 
Average Similarity: 0.6990 +2025-05-12T10:14:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:14:59 | INFO | __main__ : Step: 14100 +2025-05-12T10:14:59 | INFO | __main__ : Current Frame Index within Batch Video: 128/247 +2025-05-12T10:14:59 | INFO | __main__ : Batch-wise Cosine Similarity | 79.79% +2025-05-12T10:14:59 | INFO | __main__ : Cosine Embedding Loss | 0.2021 +2025-05-12T10:14:59 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:14:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:14:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:15:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:15:22 | INFO | __main__ : Step: 14200 +2025-05-12T10:15:22 | INFO | __main__ : Current Frame Index within Batch Video: 228/247 +2025-05-12T10:15:22 | INFO | __main__ : Batch-wise Cosine Similarity | 80.87% +2025-05-12T10:15:22 | INFO | __main__ : Cosine Embedding Loss | 0.1913 +2025-05-12T10:15:22 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:15:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:15:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:15:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 58/4978] eta: 3 days, 10:21:19 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.2096 eval_avg_sim: 0.6990 video-cosine_similarity: 0.7904 time: 59.7867 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:15:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:15:46 | INFO | __main__ : Step: 14300 +2025-05-12T10:15:46 | INFO | __main__ : Current Frame Index within Batch Video: 87/247 +2025-05-12T10:15:46 | INFO | __main__ : Batch-wise Cosine Similarity | 76.36% +2025-05-12T10:15:46 | INFO | 
__main__ : Cosine Embedding Loss | 0.2364 +2025-05-12T10:15:46 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:15:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:15:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:16:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:16:10 | INFO | __main__ : Step: 14400 +2025-05-12T10:16:10 | INFO | __main__ : Current Frame Index within Batch Video: 187/247 +2025-05-12T10:16:10 | INFO | __main__ : Batch-wise Cosine Similarity | 83.61% +2025-05-12T10:16:10 | INFO | __main__ : Cosine Embedding Loss | 0.1639 +2025-05-12T10:16:10 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:16:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:16:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:16:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 59/4978] eta: 3 days, 10:16:29 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1471 eval_avg_sim: 0.6990 video-cosine_similarity: 0.8529 time: 59.7879 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:16:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:16:34 | INFO | __main__ : Step: 14500 +2025-05-12T10:16:34 | INFO | __main__ : Current Frame Index within Batch Video: 46/247 +2025-05-12T10:16:34 | INFO | __main__ : Batch-wise Cosine Similarity | 71.61% +2025-05-12T10:16:34 | INFO | __main__ : Cosine Embedding Loss | 0.2839 +2025-05-12T10:16:34 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:16:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:16:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:16:58 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:16:58 | INFO | __main__ : Step: 14600 +2025-05-12T10:16:58 | INFO | __main__ : Current Frame Index within Batch Video: 146/247 +2025-05-12T10:16:58 | INFO | __main__ : Batch-wise Cosine Similarity | 78.94% +2025-05-12T10:16:58 | INFO | __main__ : Cosine Embedding Loss | 0.2106 +2025-05-12T10:16:58 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:16:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:16:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:17:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:17:21 | INFO | __main__ : Step: 14700 +2025-05-12T10:17:21 | INFO | __main__ : Current Frame Index within Batch Video: 246/247 +2025-05-12T10:17:21 | INFO | __main__ : Batch-wise Cosine Similarity | 82.74% +2025-05-12T10:17:21 | INFO | __main__ : Cosine Embedding Loss | 0.1726 +2025-05-12T10:17:21 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:17:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:17:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:17:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 60/4978] eta: 3 days, 10:11:44 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1740 eval_avg_sim: 0.6990 video-cosine_similarity: 0.8260 time: 59.7889 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:17:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:17:45 | INFO | __main__ : Step: 14800 +2025-05-12T10:17:45 | INFO | __main__ : Current Frame Index within Batch Video: 105/247 +2025-05-12T10:17:45 | INFO | __main__ : Batch-wise Cosine Similarity | 77.08% +2025-05-12T10:17:45 | INFO | __main__ : Cosine Embedding Loss | 0.2292 +2025-05-12T10:17:45 | 
INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:17:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:17:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:18:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:18:09 | INFO | __main__ : Step: 14900 +2025-05-12T10:18:09 | INFO | __main__ : Current Frame Index within Batch Video: 205/247 +2025-05-12T10:18:09 | INFO | __main__ : Batch-wise Cosine Similarity | 80.94% +2025-05-12T10:18:09 | INFO | __main__ : Cosine Embedding Loss | 0.1906 +2025-05-12T10:18:09 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:18:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:18:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:18:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 61/4978] eta: 3 days, 10:07:07 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1696 eval_avg_sim: 0.6990 video-cosine_similarity: 0.8304 time: 59.3129 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:18:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:18:33 | INFO | __main__ : Step: 15000 +2025-05-12T10:18:33 | INFO | __main__ : Current Frame Index within Batch Video: 64/247 +2025-05-12T10:18:33 | INFO | __main__ : Batch-wise Cosine Similarity | 73.96% +2025-05-12T10:18:33 | INFO | __main__ : Cosine Embedding Loss | 0.2604 +2025-05-12T10:18:33 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:18:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:18:33 | INFO | __main__ : Evaluation Average Sim | 0.6990 +2025-05-12T10:18:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:18:33 | INFO | __main__ : Saving checkpoint at global step 15000 +2025-05-12T10:18:34 
| INFO | __main__ : Performing periodic evaluation at global step 15000... +2025-05-12T10:18:34 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:18:34 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:18:34 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:18:34 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:18:43 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5957 +2025-05-12T10:18:43 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0015000.png +2025-05-12T10:18:43 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:18:43 | INFO | __main__ : Evaluation at step 15000 complete. Average Similarity: 0.5957 +2025-05-12T10:19:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:19:06 | INFO | __main__ : Step: 15100 +2025-05-12T10:19:06 | INFO | __main__ : Current Frame Index within Batch Video: 164/247 +2025-05-12T10:19:06 | INFO | __main__ : Batch-wise Cosine Similarity | 81.55% +2025-05-12T10:19:06 | INFO | __main__ : Cosine Embedding Loss | 0.1845 +2025-05-12T10:19:06 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:19:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:19:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:19:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 62/4978] eta: 3 days, 10:14:54 lr: 0.000002 temperature: 0.0126 video-loss_cosine: 0.1577 eval_avg_sim: 0.5957 video-cosine_similarity: 0.8423 time: 59.7870 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:19:30 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:19:30 | INFO | __main__ : Step: 15200 +2025-05-12T10:19:30 | INFO | __main__ : Current Frame Index within Batch Video: 23/247 +2025-05-12T10:19:30 | INFO | __main__ : Batch-wise Cosine Similarity | 62.26% +2025-05-12T10:19:30 | INFO | __main__ : Cosine Embedding Loss | 0.3774 +2025-05-12T10:19:30 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:19:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:19:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:19:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:19:54 | INFO | __main__ : Step: 15300 +2025-05-12T10:19:54 | INFO | __main__ : Current Frame Index within Batch Video: 123/247 +2025-05-12T10:19:54 | INFO | __main__ : Batch-wise Cosine Similarity | 76.93% +2025-05-12T10:19:54 | INFO | __main__ : Cosine Embedding Loss | 0.2307 +2025-05-12T10:19:54 | INFO | __main__ : Learning Rate | 0.000002 +2025-05-12T10:19:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:19:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:20:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:20:18 | INFO | __main__ : Step: 15400 +2025-05-12T10:20:18 | INFO | __main__ : Current Frame Index within Batch Video: 223/247 +2025-05-12T10:20:18 | INFO | __main__ : Batch-wise Cosine Similarity | 80.93% +2025-05-12T10:20:18 | INFO | __main__ : Cosine Embedding Loss | 0.1907 +2025-05-12T10:20:18 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:20:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:20:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:20:23 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 63/4978] eta: 3 days, 10:10:17 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1862 eval_avg_sim: 0.5957 video-cosine_similarity: 0.8138 time: 59.7872 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:20:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:20:42 | INFO | __main__ : Step: 15500 +2025-05-12T10:20:42 | INFO | __main__ : Current Frame Index within Batch Video: 82/247 +2025-05-12T10:20:42 | INFO | __main__ : Batch-wise Cosine Similarity | 78.09% +2025-05-12T10:20:42 | INFO | __main__ : Cosine Embedding Loss | 0.2191 +2025-05-12T10:20:42 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:20:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:20:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:21:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:21:05 | INFO | __main__ : Step: 15600 +2025-05-12T10:21:05 | INFO | __main__ : Current Frame Index within Batch Video: 182/247 +2025-05-12T10:21:05 | INFO | __main__ : Batch-wise Cosine Similarity | 82.60% +2025-05-12T10:21:05 | INFO | __main__ : Cosine Embedding Loss | 0.1740 +2025-05-12T10:21:05 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:21:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:21:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:21:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 64/4978] eta: 3 days, 10:05:48 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1346 eval_avg_sim: 0.5957 video-cosine_similarity: 0.8654 time: 59.7884 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:21:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:21:29 | INFO | __main__ : 
Step: 15700 +2025-05-12T10:21:29 | INFO | __main__ : Current Frame Index within Batch Video: 41/247 +2025-05-12T10:21:29 | INFO | __main__ : Batch-wise Cosine Similarity | 71.80% +2025-05-12T10:21:29 | INFO | __main__ : Cosine Embedding Loss | 0.2820 +2025-05-12T10:21:29 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:21:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:21:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:21:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:21:53 | INFO | __main__ : Step: 15800 +2025-05-12T10:21:53 | INFO | __main__ : Current Frame Index within Batch Video: 141/247 +2025-05-12T10:21:53 | INFO | __main__ : Batch-wise Cosine Similarity | 79.33% +2025-05-12T10:21:53 | INFO | __main__ : Cosine Embedding Loss | 0.2067 +2025-05-12T10:21:53 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:21:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:21:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:22:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:22:17 | INFO | __main__ : Step: 15900 +2025-05-12T10:22:17 | INFO | __main__ : Current Frame Index within Batch Video: 241/247 +2025-05-12T10:22:17 | INFO | __main__ : Batch-wise Cosine Similarity | 83.07% +2025-05-12T10:22:17 | INFO | __main__ : Cosine Embedding Loss | 0.1693 +2025-05-12T10:22:17 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:22:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:22:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:22:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 65/4978] eta: 3 days, 10:01:30 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1821 
eval_avg_sim: 0.5957 video-cosine_similarity: 0.8179 time: 59.3246 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:22:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:22:41 | INFO | __main__ : Step: 16000 +2025-05-12T10:22:41 | INFO | __main__ : Current Frame Index within Batch Video: 100/247 +2025-05-12T10:22:41 | INFO | __main__ : Batch-wise Cosine Similarity | 77.60% +2025-05-12T10:22:41 | INFO | __main__ : Cosine Embedding Loss | 0.2240 +2025-05-12T10:22:41 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:22:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:22:41 | INFO | __main__ : Evaluation Average Sim | 0.5957 +2025-05-12T10:22:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:22:41 | INFO | __main__ : Performing periodic evaluation at global step 16000... +2025-05-12T10:22:41 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:22:41 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:22:41 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:22:41 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:22:50 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6104 +2025-05-12T10:22:50 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0016000.png +2025-05-12T10:22:50 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:22:50 | INFO | __main__ : Evaluation at step 16000 complete. 
Average Similarity: 0.6104 +2025-05-12T10:23:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:23:14 | INFO | __main__ : Step: 16100 +2025-05-12T10:23:14 | INFO | __main__ : Current Frame Index within Batch Video: 200/247 +2025-05-12T10:23:14 | INFO | __main__ : Batch-wise Cosine Similarity | 82.67% +2025-05-12T10:23:14 | INFO | __main__ : Cosine Embedding Loss | 0.1733 +2025-05-12T10:23:14 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:23:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:23:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:23:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 66/4978] eta: 3 days, 10:08:37 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1552 eval_avg_sim: 0.6104 video-cosine_similarity: 0.8448 time: 59.7919 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:23:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:23:38 | INFO | __main__ : Step: 16200 +2025-05-12T10:23:38 | INFO | __main__ : Current Frame Index within Batch Video: 59/247 +2025-05-12T10:23:38 | INFO | __main__ : Batch-wise Cosine Similarity | 75.94% +2025-05-12T10:23:38 | INFO | __main__ : Cosine Embedding Loss | 0.2406 +2025-05-12T10:23:38 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:23:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:23:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:24:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:24:02 | INFO | __main__ : Step: 16300 +2025-05-12T10:24:02 | INFO | __main__ : Current Frame Index within Batch Video: 159/247 +2025-05-12T10:24:02 | INFO | __main__ : Batch-wise Cosine Similarity | 82.30% +2025-05-12T10:24:02 | INFO | 
__main__ : Cosine Embedding Loss | 0.1770 +2025-05-12T10:24:02 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:24:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:24:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:24:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 67/4978] eta: 3 days, 10:04:10 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1468 eval_avg_sim: 0.6104 video-cosine_similarity: 0.8532 time: 59.7870 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:24:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:24:25 | INFO | __main__ : Step: 16400 +2025-05-12T10:24:25 | INFO | __main__ : Current Frame Index within Batch Video: 18/247 +2025-05-12T10:24:25 | INFO | __main__ : Batch-wise Cosine Similarity | 63.98% +2025-05-12T10:24:25 | INFO | __main__ : Cosine Embedding Loss | 0.3602 +2025-05-12T10:24:25 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:24:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:24:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:24:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:24:49 | INFO | __main__ : Step: 16500 +2025-05-12T10:24:49 | INFO | __main__ : Current Frame Index within Batch Video: 118/247 +2025-05-12T10:24:49 | INFO | __main__ : Batch-wise Cosine Similarity | 80.53% +2025-05-12T10:24:49 | INFO | __main__ : Cosine Embedding Loss | 0.1947 +2025-05-12T10:24:49 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:24:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:24:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:25:13 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:25:13 | INFO | __main__ : Step: 16600 +2025-05-12T10:25:13 | INFO | __main__ : Current Frame Index within Batch Video: 218/247 +2025-05-12T10:25:13 | INFO | __main__ : Batch-wise Cosine Similarity | 84.28% +2025-05-12T10:25:13 | INFO | __main__ : Cosine Embedding Loss | 0.1572 +2025-05-12T10:25:13 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:25:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:25:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:25:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 68/4978] eta: 3 days, 9:59:56 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1585 eval_avg_sim: 0.6104 video-cosine_similarity: 0.8415 time: 59.7855 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:25:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:25:37 | INFO | __main__ : Step: 16700 +2025-05-12T10:25:37 | INFO | __main__ : Current Frame Index within Batch Video: 77/247 +2025-05-12T10:25:37 | INFO | __main__ : Batch-wise Cosine Similarity | 76.86% +2025-05-12T10:25:37 | INFO | __main__ : Cosine Embedding Loss | 0.2314 +2025-05-12T10:25:37 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:25:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:25:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:01 | INFO | __main__ : Step: 16800 +2025-05-12T10:26:01 | INFO | __main__ : Current Frame Index within Batch Video: 177/247 +2025-05-12T10:26:01 | INFO | __main__ : Batch-wise Cosine Similarity | 82.94% +2025-05-12T10:26:01 | INFO | __main__ : Cosine Embedding Loss | 0.1706 +2025-05-12T10:26:01 | INFO 
| __main__ : Learning Rate | 0.000003 +2025-05-12T10:26:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:26:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 69/4978] eta: 3 days, 9:55:48 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1469 eval_avg_sim: 0.6104 video-cosine_similarity: 0.8531 time: 59.3149 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:26:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:25 | INFO | __main__ : Step: 16900 +2025-05-12T10:26:25 | INFO | __main__ : Current Frame Index within Batch Video: 36/247 +2025-05-12T10:26:25 | INFO | __main__ : Batch-wise Cosine Similarity | 71.61% +2025-05-12T10:26:25 | INFO | __main__ : Cosine Embedding Loss | 0.2839 +2025-05-12T10:26:25 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:26:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:26:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:48 | INFO | __main__ : Step: 17000 +2025-05-12T10:26:48 | INFO | __main__ : Current Frame Index within Batch Video: 136/247 +2025-05-12T10:26:48 | INFO | __main__ : Batch-wise Cosine Similarity | 82.79% +2025-05-12T10:26:48 | INFO | __main__ : Cosine Embedding Loss | 0.1721 +2025-05-12T10:26:48 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:26:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:26:48 | INFO | __main__ : Evaluation Average Sim | 0.6104 +2025-05-12T10:26:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:26:49 | INFO | __main__ : Performing periodic evaluation at global step 17000... 
+2025-05-12T10:26:49 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:26:49 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:26:49 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:26:49 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:26:58 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6678 +2025-05-12T10:26:58 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0017000.png +2025-05-12T10:26:58 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:26:58 | INFO | __main__ : Evaluation at step 17000 complete. Average Similarity: 0.6678 +2025-05-12T10:27:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:27:21 | INFO | __main__ : Step: 17100 +2025-05-12T10:27:21 | INFO | __main__ : Current Frame Index within Batch Video: 236/247 +2025-05-12T10:27:21 | INFO | __main__ : Batch-wise Cosine Similarity | 85.08% +2025-05-12T10:27:21 | INFO | __main__ : Cosine Embedding Loss | 0.1492 +2025-05-12T10:27:21 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:27:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:27:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:27:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 70/4978] eta: 3 days, 10:02:29 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1429 eval_avg_sim: 0.6678 video-cosine_similarity: 0.8571 time: 59.7844 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:27:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:27:45 | INFO | __main__ : 
Step: 17200 +2025-05-12T10:27:45 | INFO | __main__ : Current Frame Index within Batch Video: 95/247 +2025-05-12T10:27:45 | INFO | __main__ : Batch-wise Cosine Similarity | 78.21% +2025-05-12T10:27:45 | INFO | __main__ : Cosine Embedding Loss | 0.2179 +2025-05-12T10:27:45 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:27:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:27:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:28:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:28:09 | INFO | __main__ : Step: 17300 +2025-05-12T10:28:09 | INFO | __main__ : Current Frame Index within Batch Video: 195/247 +2025-05-12T10:28:09 | INFO | __main__ : Batch-wise Cosine Similarity | 83.06% +2025-05-12T10:28:09 | INFO | __main__ : Cosine Embedding Loss | 0.1694 +2025-05-12T10:28:09 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:28:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:28:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:28:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 71/4978] eta: 3 days, 9:58:21 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1741 eval_avg_sim: 0.6678 video-cosine_similarity: 0.8259 time: 59.7836 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:28:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:28:33 | INFO | __main__ : Step: 17400 +2025-05-12T10:28:33 | INFO | __main__ : Current Frame Index within Batch Video: 54/247 +2025-05-12T10:28:33 | INFO | __main__ : Batch-wise Cosine Similarity | 75.72% +2025-05-12T10:28:33 | INFO | __main__ : Cosine Embedding Loss | 0.2428 +2025-05-12T10:28:33 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:28:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:28:33 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:28:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:28:57 | INFO | __main__ : Step: 17500 +2025-05-12T10:28:57 | INFO | __main__ : Current Frame Index within Batch Video: 154/247 +2025-05-12T10:28:57 | INFO | __main__ : Batch-wise Cosine Similarity | 83.32% +2025-05-12T10:28:57 | INFO | __main__ : Cosine Embedding Loss | 0.1668 +2025-05-12T10:28:57 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:28:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:28:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:29:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 72/4978] eta: 3 days, 9:54:17 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1395 eval_avg_sim: 0.6678 video-cosine_similarity: 0.8605 time: 59.7830 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:29:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:29:21 | INFO | __main__ : Step: 17600 +2025-05-12T10:29:21 | INFO | __main__ : Current Frame Index within Batch Video: 13/247 +2025-05-12T10:29:21 | INFO | __main__ : Batch-wise Cosine Similarity | 61.55% +2025-05-12T10:29:21 | INFO | __main__ : Cosine Embedding Loss | 0.3845 +2025-05-12T10:29:21 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:29:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:29:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:29:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:29:44 | INFO | __main__ : Step: 17700 +2025-05-12T10:29:44 | INFO | __main__ : Current Frame Index within Batch Video: 113/247 +2025-05-12T10:29:44 | INFO | __main__ 
: Batch-wise Cosine Similarity | 79.74% +2025-05-12T10:29:44 | INFO | __main__ : Cosine Embedding Loss | 0.2026 +2025-05-12T10:29:44 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:29:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:29:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:08 | INFO | __main__ : Step: 17800 +2025-05-12T10:30:08 | INFO | __main__ : Current Frame Index within Batch Video: 213/247 +2025-05-12T10:30:08 | INFO | __main__ : Batch-wise Cosine Similarity | 84.93% +2025-05-12T10:30:08 | INFO | __main__ : Cosine Embedding Loss | 0.1507 +2025-05-12T10:30:08 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:30:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:30:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 73/4978] eta: 3 days, 9:50:22 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1220 eval_avg_sim: 0.6678 video-cosine_similarity: 0.8780 time: 59.3176 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:30:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:32 | INFO | __main__ : Step: 17900 +2025-05-12T10:30:32 | INFO | __main__ : Current Frame Index within Batch Video: 72/247 +2025-05-12T10:30:32 | INFO | __main__ : Batch-wise Cosine Similarity | 78.60% +2025-05-12T10:30:32 | INFO | __main__ : Cosine Embedding Loss | 0.2140 +2025-05-12T10:30:32 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:30:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:30:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:56 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:56 | INFO | __main__ : Step: 18000 +2025-05-12T10:30:56 | INFO | __main__ : Current Frame Index within Batch Video: 172/247 +2025-05-12T10:30:56 | INFO | __main__ : Batch-wise Cosine Similarity | 80.71% +2025-05-12T10:30:56 | INFO | __main__ : Cosine Embedding Loss | 0.1929 +2025-05-12T10:30:56 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:30:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:30:56 | INFO | __main__ : Evaluation Average Sim | 0.6678 +2025-05-12T10:30:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:30:56 | INFO | __main__ : Performing periodic evaluation at global step 18000... +2025-05-12T10:30:56 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:30:56 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:30:56 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:30:56 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:31:05 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5896 +2025-05-12T10:31:06 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0018000.png +2025-05-12T10:31:06 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:31:06 | INFO | __main__ : Evaluation at step 18000 complete. 
Average Similarity: 0.5896 +2025-05-12T10:31:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 74/4978] eta: 3 days, 9:56:37 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1427 eval_avg_sim: 0.5896 video-cosine_similarity: 0.8573 time: 59.7863 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:31:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:31:29 | INFO | __main__ : Step: 18100 +2025-05-12T10:31:29 | INFO | __main__ : Current Frame Index within Batch Video: 31/247 +2025-05-12T10:31:29 | INFO | __main__ : Batch-wise Cosine Similarity | 71.51% +2025-05-12T10:31:29 | INFO | __main__ : Cosine Embedding Loss | 0.2849 +2025-05-12T10:31:29 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:31:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:31:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:31:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:31:53 | INFO | __main__ : Step: 18200 +2025-05-12T10:31:53 | INFO | __main__ : Current Frame Index within Batch Video: 131/247 +2025-05-12T10:31:53 | INFO | __main__ : Batch-wise Cosine Similarity | 82.99% +2025-05-12T10:31:53 | INFO | __main__ : Cosine Embedding Loss | 0.1701 +2025-05-12T10:31:53 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:31:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:31:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:32:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:32:17 | INFO | __main__ : Step: 18300 +2025-05-12T10:32:17 | INFO | __main__ : Current Frame Index within Batch Video: 231/247 +2025-05-12T10:32:17 | INFO | __main__ : Batch-wise Cosine Similarity | 86.78% +2025-05-12T10:32:17 | INFO | 
__main__ : Cosine Embedding Loss | 0.1322 +2025-05-12T10:32:17 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:32:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:32:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:32:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 75/4978] eta: 3 days, 9:52:39 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1383 eval_avg_sim: 0.5896 video-cosine_similarity: 0.8617 time: 59.7848 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:32:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:32:41 | INFO | __main__ : Step: 18400 +2025-05-12T10:32:41 | INFO | __main__ : Current Frame Index within Batch Video: 90/247 +2025-05-12T10:32:41 | INFO | __main__ : Batch-wise Cosine Similarity | 77.61% +2025-05-12T10:32:41 | INFO | __main__ : Cosine Embedding Loss | 0.2239 +2025-05-12T10:32:41 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:32:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:32:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:33:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:33:04 | INFO | __main__ : Step: 18500 +2025-05-12T10:33:04 | INFO | __main__ : Current Frame Index within Batch Video: 190/247 +2025-05-12T10:33:04 | INFO | __main__ : Batch-wise Cosine Similarity | 85.96% +2025-05-12T10:33:04 | INFO | __main__ : Cosine Embedding Loss | 0.1404 +2025-05-12T10:33:04 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:33:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:33:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:33:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 76/4978] eta: 3 days, 9:48:49 lr: 
0.000003 temperature: 0.0126 video-loss_cosine: 0.1339 eval_avg_sim: 0.5896 video-cosine_similarity: 0.8661 time: 59.7854 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:33:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:33:28 | INFO | __main__ : Step: 18600 +2025-05-12T10:33:28 | INFO | __main__ : Current Frame Index within Batch Video: 49/247 +2025-05-12T10:33:28 | INFO | __main__ : Batch-wise Cosine Similarity | 74.46% +2025-05-12T10:33:28 | INFO | __main__ : Cosine Embedding Loss | 0.2554 +2025-05-12T10:33:28 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:33:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:33:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:33:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:33:52 | INFO | __main__ : Step: 18700 +2025-05-12T10:33:52 | INFO | __main__ : Current Frame Index within Batch Video: 149/247 +2025-05-12T10:33:52 | INFO | __main__ : Batch-wise Cosine Similarity | 83.53% +2025-05-12T10:33:52 | INFO | __main__ : Cosine Embedding Loss | 0.1647 +2025-05-12T10:33:52 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:33:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:33:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:34:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 77/4978] eta: 3 days, 9:45:03 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1553 eval_avg_sim: 0.5896 video-cosine_similarity: 0.8447 time: 59.7856 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T10:34:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:34:16 | INFO | __main__ : Step: 18800 +2025-05-12T10:34:16 | INFO | __main__ : Current Frame Index 
within Batch Video: 8/247 +2025-05-12T10:34:16 | INFO | __main__ : Batch-wise Cosine Similarity | 58.15% +2025-05-12T10:34:16 | INFO | __main__ : Cosine Embedding Loss | 0.4185 +2025-05-12T10:34:16 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:34:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:34:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:34:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:34:40 | INFO | __main__ : Step: 18900 +2025-05-12T10:34:40 | INFO | __main__ : Current Frame Index within Batch Video: 108/247 +2025-05-12T10:34:40 | INFO | __main__ : Batch-wise Cosine Similarity | 83.22% +2025-05-12T10:34:40 | INFO | __main__ : Cosine Embedding Loss | 0.1678 +2025-05-12T10:34:40 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:34:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:34:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:35:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:35:04 | INFO | __main__ : Step: 19000 +2025-05-12T10:35:04 | INFO | __main__ : Current Frame Index within Batch Video: 208/247 +2025-05-12T10:35:04 | INFO | __main__ : Batch-wise Cosine Similarity | 86.68% +2025-05-12T10:35:04 | INFO | __main__ : Cosine Embedding Loss | 0.1332 +2025-05-12T10:35:04 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:35:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:35:04 | INFO | __main__ : Evaluation Average Sim | 0.5896 +2025-05-12T10:35:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:35:04 | INFO | __main__ : Performing periodic evaluation at global step 19000... 
+2025-05-12T10:35:04 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:35:04 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:35:04 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:35:04 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:35:13 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5784 +2025-05-12T10:35:13 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0019000.png +2025-05-12T10:35:13 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:35:13 | INFO | __main__ : Evaluation at step 19000 complete. Average Similarity: 0.5784 +2025-05-12T10:35:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 78/4978] eta: 3 days, 9:50:57 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1146 eval_avg_sim: 0.5784 video-cosine_similarity: 0.8854 time: 59.7629 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:35:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:35:37 | INFO | __main__ : Step: 19100 +2025-05-12T10:35:37 | INFO | __main__ : Current Frame Index within Batch Video: 67/247 +2025-05-12T10:35:37 | INFO | __main__ : Batch-wise Cosine Similarity | 77.25% +2025-05-12T10:35:37 | INFO | __main__ : Cosine Embedding Loss | 0.2275 +2025-05-12T10:35:37 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:35:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:35:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:36:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:36:00 | INFO | __main__ : 
Step: 19200 +2025-05-12T10:36:00 | INFO | __main__ : Current Frame Index within Batch Video: 167/247 +2025-05-12T10:36:00 | INFO | __main__ : Batch-wise Cosine Similarity | 83.06% +2025-05-12T10:36:00 | INFO | __main__ : Cosine Embedding Loss | 0.1694 +2025-05-12T10:36:00 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:36:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:36:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:36:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 79/4978] eta: 3 days, 9:47:10 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1601 eval_avg_sim: 0.5784 video-cosine_similarity: 0.8399 time: 59.7605 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:36:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:36:24 | INFO | __main__ : Step: 19300 +2025-05-12T10:36:24 | INFO | __main__ : Current Frame Index within Batch Video: 26/247 +2025-05-12T10:36:24 | INFO | __main__ : Batch-wise Cosine Similarity | 63.81% +2025-05-12T10:36:24 | INFO | __main__ : Cosine Embedding Loss | 0.3619 +2025-05-12T10:36:24 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:36:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:36:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:36:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:36:48 | INFO | __main__ : Step: 19400 +2025-05-12T10:36:48 | INFO | __main__ : Current Frame Index within Batch Video: 126/247 +2025-05-12T10:36:48 | INFO | __main__ : Batch-wise Cosine Similarity | 81.60% +2025-05-12T10:36:48 | INFO | __main__ : Cosine Embedding Loss | 0.1840 +2025-05-12T10:36:48 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:36:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:36:48 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:37:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:37:12 | INFO | __main__ : Step: 19500 +2025-05-12T10:37:12 | INFO | __main__ : Current Frame Index within Batch Video: 226/247 +2025-05-12T10:37:12 | INFO | __main__ : Batch-wise Cosine Similarity | 88.40% +2025-05-12T10:37:12 | INFO | __main__ : Cosine Embedding Loss | 0.1160 +2025-05-12T10:37:12 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:37:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:37:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:37:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 80/4978] eta: 3 days, 9:43:28 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1015 eval_avg_sim: 0.5784 video-cosine_similarity: 0.8985 time: 59.7600 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:37:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:37:36 | INFO | __main__ : Step: 19600 +2025-05-12T10:37:36 | INFO | __main__ : Current Frame Index within Batch Video: 85/247 +2025-05-12T10:37:36 | INFO | __main__ : Batch-wise Cosine Similarity | 77.77% +2025-05-12T10:37:36 | INFO | __main__ : Cosine Embedding Loss | 0.2223 +2025-05-12T10:37:36 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:37:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:37:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:38:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:38:00 | INFO | __main__ : Step: 19700 +2025-05-12T10:38:00 | INFO | __main__ : Current Frame Index within Batch Video: 185/247 +2025-05-12T10:38:00 | INFO | __main__ 
: Batch-wise Cosine Similarity | 84.00% +2025-05-12T10:38:00 | INFO | __main__ : Cosine Embedding Loss | 0.1600 +2025-05-12T10:38:00 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:38:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:38:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:38:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 81/4978] eta: 3 days, 9:39:52 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1285 eval_avg_sim: 0.5784 video-cosine_similarity: 0.8715 time: 59.7610 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:38:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:38:24 | INFO | __main__ : Step: 19800 +2025-05-12T10:38:24 | INFO | __main__ : Current Frame Index within Batch Video: 44/247 +2025-05-12T10:38:24 | INFO | __main__ : Batch-wise Cosine Similarity | 73.69% +2025-05-12T10:38:24 | INFO | __main__ : Cosine Embedding Loss | 0.2631 +2025-05-12T10:38:24 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:38:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:38:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:38:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:38:47 | INFO | __main__ : Step: 19900 +2025-05-12T10:38:47 | INFO | __main__ : Current Frame Index within Batch Video: 144/247 +2025-05-12T10:38:47 | INFO | __main__ : Batch-wise Cosine Similarity | 82.26% +2025-05-12T10:38:47 | INFO | __main__ : Cosine Embedding Loss | 0.1774 +2025-05-12T10:38:47 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:38:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:38:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:39:11 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:39:11 | INFO | __main__ : Step: 20000 +2025-05-12T10:39:11 | INFO | __main__ : Current Frame Index within Batch Video: 244/247 +2025-05-12T10:39:11 | INFO | __main__ : Batch-wise Cosine Similarity | 85.74% +2025-05-12T10:39:11 | INFO | __main__ : Cosine Embedding Loss | 0.1426 +2025-05-12T10:39:11 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:39:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:39:11 | INFO | __main__ : Evaluation Average Sim | 0.5784 +2025-05-12T10:39:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:39:11 | INFO | __main__ : Saving checkpoint at global step 20000 +2025-05-12T10:39:12 | INFO | __main__ : Performing periodic evaluation at global step 20000... +2025-05-12T10:39:12 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:39:12 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:39:12 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:39:12 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:39:21 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6147 +2025-05-12T10:39:21 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0020000.png +2025-05-12T10:39:21 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:39:21 | INFO | __main__ : Evaluation at step 20000 complete. 
Average Similarity: 0.6147 +2025-05-12T10:39:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 82/4978] eta: 3 days, 9:45:37 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1357 eval_avg_sim: 0.6147 video-cosine_similarity: 0.8643 time: 59.7624 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:39:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:39:44 | INFO | __main__ : Step: 20100 +2025-05-12T10:39:44 | INFO | __main__ : Current Frame Index within Batch Video: 103/247 +2025-05-12T10:39:44 | INFO | __main__ : Batch-wise Cosine Similarity | 78.96% +2025-05-12T10:39:44 | INFO | __main__ : Cosine Embedding Loss | 0.2104 +2025-05-12T10:39:44 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:39:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:39:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:40:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:40:08 | INFO | __main__ : Step: 20200 +2025-05-12T10:40:08 | INFO | __main__ : Current Frame Index within Batch Video: 203/247 +2025-05-12T10:40:08 | INFO | __main__ : Batch-wise Cosine Similarity | 86.91% +2025-05-12T10:40:08 | INFO | __main__ : Cosine Embedding Loss | 0.1309 +2025-05-12T10:40:08 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:40:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:40:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:40:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 83/4978] eta: 3 days, 9:41:58 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1094 eval_avg_sim: 0.6147 video-cosine_similarity: 0.8906 time: 59.7618 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:40:32 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:40:32 | INFO | __main__ : Step: 20300 +2025-05-12T10:40:32 | INFO | __main__ : Current Frame Index within Batch Video: 62/247 +2025-05-12T10:40:32 | INFO | __main__ : Batch-wise Cosine Similarity | 76.63% +2025-05-12T10:40:32 | INFO | __main__ : Cosine Embedding Loss | 0.2337 +2025-05-12T10:40:32 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:40:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:40:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:40:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:40:56 | INFO | __main__ : Step: 20400 +2025-05-12T10:40:56 | INFO | __main__ : Current Frame Index within Batch Video: 162/247 +2025-05-12T10:40:56 | INFO | __main__ : Batch-wise Cosine Similarity | 80.73% +2025-05-12T10:40:56 | INFO | __main__ : Cosine Embedding Loss | 0.1927 +2025-05-12T10:40:56 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:40:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:40:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:41:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 84/4978] eta: 3 days, 9:38:27 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1174 eval_avg_sim: 0.6147 video-cosine_similarity: 0.8826 time: 59.7639 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:41:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:41:20 | INFO | __main__ : Step: 20500 +2025-05-12T10:41:20 | INFO | __main__ : Current Frame Index within Batch Video: 21/247 +2025-05-12T10:41:20 | INFO | __main__ : Batch-wise Cosine Similarity | 66.30% +2025-05-12T10:41:20 | INFO | __main__ : Cosine Embedding Loss | 0.3370 +2025-05-12T10:41:20 | INFO | 
__main__ : Learning Rate | 0.000003 +2025-05-12T10:41:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:41:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:41:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:41:44 | INFO | __main__ : Step: 20600 +2025-05-12T10:41:44 | INFO | __main__ : Current Frame Index within Batch Video: 121/247 +2025-05-12T10:41:44 | INFO | __main__ : Batch-wise Cosine Similarity | 84.69% +2025-05-12T10:41:44 | INFO | __main__ : Cosine Embedding Loss | 0.1531 +2025-05-12T10:41:44 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:41:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:41:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:42:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:42:07 | INFO | __main__ : Step: 20700 +2025-05-12T10:42:07 | INFO | __main__ : Current Frame Index within Batch Video: 221/247 +2025-05-12T10:42:07 | INFO | __main__ : Batch-wise Cosine Similarity | 90.79% +2025-05-12T10:42:07 | INFO | __main__ : Cosine Embedding Loss | 0.0921 +2025-05-12T10:42:07 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:42:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:42:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:42:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 85/4978] eta: 3 days, 9:35:03 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.0888 eval_avg_sim: 0.6147 video-cosine_similarity: 0.9112 time: 59.7661 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:42:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:42:31 | INFO | __main__ : Step: 20800 
+2025-05-12T10:42:31 | INFO | __main__ : Current Frame Index within Batch Video: 80/247 +2025-05-12T10:42:31 | INFO | __main__ : Batch-wise Cosine Similarity | 79.42% +2025-05-12T10:42:31 | INFO | __main__ : Cosine Embedding Loss | 0.2058 +2025-05-12T10:42:31 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:42:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:42:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:42:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:42:55 | INFO | __main__ : Step: 20900 +2025-05-12T10:42:55 | INFO | __main__ : Current Frame Index within Batch Video: 180/247 +2025-05-12T10:42:55 | INFO | __main__ : Batch-wise Cosine Similarity | 86.66% +2025-05-12T10:42:55 | INFO | __main__ : Cosine Embedding Loss | 0.1334 +2025-05-12T10:42:55 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:42:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:42:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:43:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 86/4978] eta: 3 days, 9:31:38 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1065 eval_avg_sim: 0.6147 video-cosine_similarity: 0.8935 time: 59.3008 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:43:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:43:19 | INFO | __main__ : Step: 21000 +2025-05-12T10:43:19 | INFO | __main__ : Current Frame Index within Batch Video: 39/247 +2025-05-12T10:43:19 | INFO | __main__ : Batch-wise Cosine Similarity | 74.08% +2025-05-12T10:43:19 | INFO | __main__ : Cosine Embedding Loss | 0.2592 +2025-05-12T10:43:19 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:43:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:43:19 | INFO | 
__main__ : Evaluation Average Sim | 0.6147 +2025-05-12T10:43:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:43:19 | INFO | __main__ : Performing periodic evaluation at global step 21000... +2025-05-12T10:43:19 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:43:19 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:43:20 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:43:20 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:43:29 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5895 +2025-05-12T10:43:29 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0021000.png +2025-05-12T10:43:29 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:43:29 | INFO | __main__ : Evaluation at step 21000 complete. 
Average Similarity: 0.5895 +2025-05-12T10:43:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:43:52 | INFO | __main__ : Step: 21100 +2025-05-12T10:43:52 | INFO | __main__ : Current Frame Index within Batch Video: 139/247 +2025-05-12T10:43:52 | INFO | __main__ : Batch-wise Cosine Similarity | 81.72% +2025-05-12T10:43:52 | INFO | __main__ : Cosine Embedding Loss | 0.1828 +2025-05-12T10:43:52 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:43:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:43:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:44:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:44:16 | INFO | __main__ : Step: 21200 +2025-05-12T10:44:16 | INFO | __main__ : Current Frame Index within Batch Video: 239/247 +2025-05-12T10:44:16 | INFO | __main__ : Batch-wise Cosine Similarity | 87.44% +2025-05-12T10:44:16 | INFO | __main__ : Cosine Embedding Loss | 0.1256 +2025-05-12T10:44:16 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:44:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:44:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:44:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 87/4978] eta: 3 days, 9:37:08 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1348 eval_avg_sim: 0.5895 video-cosine_similarity: 0.8652 time: 59.7842 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:44:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:44:40 | INFO | __main__ : Step: 21300 +2025-05-12T10:44:40 | INFO | __main__ : Current Frame Index within Batch Video: 98/247 +2025-05-12T10:44:40 | INFO | __main__ : Batch-wise Cosine Similarity | 83.05% +2025-05-12T10:44:40 | INFO | 
__main__ : Cosine Embedding Loss | 0.1695 +2025-05-12T10:44:40 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:44:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:44:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:45:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:45:04 | INFO | __main__ : Step: 21400 +2025-05-12T10:45:04 | INFO | __main__ : Current Frame Index within Batch Video: 198/247 +2025-05-12T10:45:04 | INFO | __main__ : Batch-wise Cosine Similarity | 85.37% +2025-05-12T10:45:04 | INFO | __main__ : Cosine Embedding Loss | 0.1463 +2025-05-12T10:45:04 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:45:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:45:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:45:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 88/4978] eta: 3 days, 9:33:44 lr: 0.000003 temperature: 0.0126 video-loss_cosine: 0.1365 eval_avg_sim: 0.5895 video-cosine_similarity: 0.8635 time: 59.7849 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:45:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:45:28 | INFO | __main__ : Step: 21500 +2025-05-12T10:45:28 | INFO | __main__ : Current Frame Index within Batch Video: 57/247 +2025-05-12T10:45:28 | INFO | __main__ : Batch-wise Cosine Similarity | 77.64% +2025-05-12T10:45:28 | INFO | __main__ : Cosine Embedding Loss | 0.2236 +2025-05-12T10:45:28 | INFO | __main__ : Learning Rate | 0.000003 +2025-05-12T10:45:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:45:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:45:52 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:45:52 | INFO | __main__ : Step: 21600 +2025-05-12T10:45:52 | INFO | __main__ : Current Frame Index within Batch Video: 157/247 +2025-05-12T10:45:52 | INFO | __main__ : Batch-wise Cosine Similarity | 86.55% +2025-05-12T10:45:52 | INFO | __main__ : Cosine Embedding Loss | 0.1345 +2025-05-12T10:45:52 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:45:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:45:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:46:13 | INFO | utils.basic_utils : Train Epoch: [0] [ 89/4978] eta: 3 days, 9:30:24 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0943 eval_avg_sim: 0.5895 video-cosine_similarity: 0.9057 time: 59.7863 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:46:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:46:16 | INFO | __main__ : Step: 21700 +2025-05-12T10:46:16 | INFO | __main__ : Current Frame Index within Batch Video: 16/247 +2025-05-12T10:46:16 | INFO | __main__ : Batch-wise Cosine Similarity | 62.65% +2025-05-12T10:46:16 | INFO | __main__ : Cosine Embedding Loss | 0.3735 +2025-05-12T10:46:16 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:46:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:46:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:46:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:46:39 | INFO | __main__ : Step: 21800 +2025-05-12T10:46:39 | INFO | __main__ : Current Frame Index within Batch Video: 116/247 +2025-05-12T10:46:39 | INFO | __main__ : Batch-wise Cosine Similarity | 83.65% +2025-05-12T10:46:39 | INFO | __main__ : Cosine Embedding Loss | 0.1635 +2025-05-12T10:46:39 | INFO 
| __main__ : Learning Rate | 0.000004 +2025-05-12T10:46:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:46:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:47:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:47:03 | INFO | __main__ : Step: 21900 +2025-05-12T10:47:03 | INFO | __main__ : Current Frame Index within Batch Video: 216/247 +2025-05-12T10:47:03 | INFO | __main__ : Batch-wise Cosine Similarity | 89.03% +2025-05-12T10:47:03 | INFO | __main__ : Cosine Embedding Loss | 0.1097 +2025-05-12T10:47:03 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:47:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:47:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:47:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 90/4978] eta: 3 days, 9:27:05 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1072 eval_avg_sim: 0.5895 video-cosine_similarity: 0.8928 time: 59.3201 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:47:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:47:27 | INFO | __main__ : Step: 22000 +2025-05-12T10:47:27 | INFO | __main__ : Current Frame Index within Batch Video: 75/247 +2025-05-12T10:47:27 | INFO | __main__ : Batch-wise Cosine Similarity | 78.29% +2025-05-12T10:47:27 | INFO | __main__ : Cosine Embedding Loss | 0.2171 +2025-05-12T10:47:27 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:47:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:47:27 | INFO | __main__ : Evaluation Average Sim | 0.5895 +2025-05-12T10:47:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:47:27 | INFO | __main__ : Performing periodic evaluation at global step 22000... 
+2025-05-12T10:47:27 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:47:27 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:47:27 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:47:27 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:47:37 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5952 +2025-05-12T10:47:37 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0022000.png +2025-05-12T10:47:37 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:47:37 | INFO | __main__ : Evaluation at step 22000 complete. Average Similarity: 0.5952 +2025-05-12T10:48:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:48:00 | INFO | __main__ : Step: 22100 +2025-05-12T10:48:00 | INFO | __main__ : Current Frame Index within Batch Video: 175/247 +2025-05-12T10:48:00 | INFO | __main__ : Batch-wise Cosine Similarity | 85.15% +2025-05-12T10:48:00 | INFO | __main__ : Cosine Embedding Loss | 0.1485 +2025-05-12T10:48:00 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:48:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:48:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:48:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 91/4978] eta: 3 days, 9:32:05 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1584 eval_avg_sim: 0.5952 video-cosine_similarity: 0.8416 time: 59.7875 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:48:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:48:24 | INFO | __main__ : 
Step: 22200 +2025-05-12T10:48:24 | INFO | __main__ : Current Frame Index within Batch Video: 34/247 +2025-05-12T10:48:24 | INFO | __main__ : Batch-wise Cosine Similarity | 73.86% +2025-05-12T10:48:24 | INFO | __main__ : Cosine Embedding Loss | 0.2614 +2025-05-12T10:48:24 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:48:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:48:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:48:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:48:48 | INFO | __main__ : Step: 22300 +2025-05-12T10:48:48 | INFO | __main__ : Current Frame Index within Batch Video: 134/247 +2025-05-12T10:48:48 | INFO | __main__ : Batch-wise Cosine Similarity | 84.53% +2025-05-12T10:48:48 | INFO | __main__ : Cosine Embedding Loss | 0.1547 +2025-05-12T10:48:48 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:48:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:48:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:49:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:49:12 | INFO | __main__ : Step: 22400 +2025-05-12T10:49:12 | INFO | __main__ : Current Frame Index within Batch Video: 234/247 +2025-05-12T10:49:12 | INFO | __main__ : Batch-wise Cosine Similarity | 89.96% +2025-05-12T10:49:12 | INFO | __main__ : Cosine Embedding Loss | 0.1004 +2025-05-12T10:49:12 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:49:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:49:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:49:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 92/4978] eta: 3 days, 9:28:46 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0915 
eval_avg_sim: 0.5952 video-cosine_similarity: 0.9085 time: 59.7877 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:49:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:49:36 | INFO | __main__ : Step: 22500 +2025-05-12T10:49:36 | INFO | __main__ : Current Frame Index within Batch Video: 93/247 +2025-05-12T10:49:36 | INFO | __main__ : Batch-wise Cosine Similarity | 83.32% +2025-05-12T10:49:36 | INFO | __main__ : Cosine Embedding Loss | 0.1668 +2025-05-12T10:49:36 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:49:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:49:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:49:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:49:59 | INFO | __main__ : Step: 22600 +2025-05-12T10:49:59 | INFO | __main__ : Current Frame Index within Batch Video: 193/247 +2025-05-12T10:49:59 | INFO | __main__ : Batch-wise Cosine Similarity | 88.60% +2025-05-12T10:49:59 | INFO | __main__ : Cosine Embedding Loss | 0.1140 +2025-05-12T10:49:59 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:49:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:49:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:50:12 | INFO | utils.basic_utils : Train Epoch: [0] [ 93/4978] eta: 3 days, 9:25:31 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0878 eval_avg_sim: 0.5952 video-cosine_similarity: 0.9122 time: 59.7871 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:50:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:50:23 | INFO | __main__ : Step: 22700 +2025-05-12T10:50:23 | INFO | __main__ : Current Frame Index within Batch Video: 52/247 +2025-05-12T10:50:23 | INFO | 
__main__ : Batch-wise Cosine Similarity | 78.71% +2025-05-12T10:50:23 | INFO | __main__ : Cosine Embedding Loss | 0.2129 +2025-05-12T10:50:23 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:50:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:50:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:50:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:50:47 | INFO | __main__ : Step: 22800 +2025-05-12T10:50:47 | INFO | __main__ : Current Frame Index within Batch Video: 152/247 +2025-05-12T10:50:47 | INFO | __main__ : Batch-wise Cosine Similarity | 85.64% +2025-05-12T10:50:47 | INFO | __main__ : Cosine Embedding Loss | 0.1436 +2025-05-12T10:50:47 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:50:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:50:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:51:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 94/4978] eta: 3 days, 9:22:19 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1087 eval_avg_sim: 0.5952 video-cosine_similarity: 0.8913 time: 59.3227 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:51:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:51:11 | INFO | __main__ : Step: 22900 +2025-05-12T10:51:11 | INFO | __main__ : Current Frame Index within Batch Video: 11/247 +2025-05-12T10:51:11 | INFO | __main__ : Batch-wise Cosine Similarity | 54.04% +2025-05-12T10:51:11 | INFO | __main__ : Cosine Embedding Loss | 0.4596 +2025-05-12T10:51:11 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:51:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:51:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:51:35 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:51:35 | INFO | __main__ : Step: 23000 +2025-05-12T10:51:35 | INFO | __main__ : Current Frame Index within Batch Video: 111/247 +2025-05-12T10:51:35 | INFO | __main__ : Batch-wise Cosine Similarity | 78.88% +2025-05-12T10:51:35 | INFO | __main__ : Cosine Embedding Loss | 0.2112 +2025-05-12T10:51:35 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:51:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:51:35 | INFO | __main__ : Evaluation Average Sim | 0.5952 +2025-05-12T10:51:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:51:35 | INFO | __main__ : Performing periodic evaluation at global step 23000... +2025-05-12T10:51:35 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:51:35 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:51:35 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:51:35 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:51:44 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6287 +2025-05-12T10:51:44 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0023000.png +2025-05-12T10:51:44 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:51:44 | INFO | __main__ : Evaluation at step 23000 complete. 
Average Similarity: 0.6287 +2025-05-12T10:52:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:52:08 | INFO | __main__ : Step: 23100 +2025-05-12T10:52:08 | INFO | __main__ : Current Frame Index within Batch Video: 211/247 +2025-05-12T10:52:08 | INFO | __main__ : Batch-wise Cosine Similarity | 81.12% +2025-05-12T10:52:08 | INFO | __main__ : Cosine Embedding Loss | 0.1888 +2025-05-12T10:52:08 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:52:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:52:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:52:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 95/4978] eta: 3 days, 9:27:06 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1917 eval_avg_sim: 0.6287 video-cosine_similarity: 0.8083 time: 59.7933 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:52:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:52:32 | INFO | __main__ : Step: 23200 +2025-05-12T10:52:32 | INFO | __main__ : Current Frame Index within Batch Video: 70/247 +2025-05-12T10:52:32 | INFO | __main__ : Batch-wise Cosine Similarity | 82.00% +2025-05-12T10:52:32 | INFO | __main__ : Cosine Embedding Loss | 0.1800 +2025-05-12T10:52:32 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:52:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:52:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:52:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:52:56 | INFO | __main__ : Step: 23300 +2025-05-12T10:52:56 | INFO | __main__ : Current Frame Index within Batch Video: 170/247 +2025-05-12T10:52:56 | INFO | __main__ : Batch-wise Cosine Similarity | 88.65% +2025-05-12T10:52:56 | INFO | 
__main__ : Cosine Embedding Loss | 0.1135 +2025-05-12T10:52:56 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:52:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:52:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:53:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 96/4978] eta: 3 days, 9:23:55 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1102 eval_avg_sim: 0.6287 video-cosine_similarity: 0.8898 time: 59.7930 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:53:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:53:19 | INFO | __main__ : Step: 23400 +2025-05-12T10:53:19 | INFO | __main__ : Current Frame Index within Batch Video: 29/247 +2025-05-12T10:53:19 | INFO | __main__ : Batch-wise Cosine Similarity | 70.85% +2025-05-12T10:53:19 | INFO | __main__ : Cosine Embedding Loss | 0.2915 +2025-05-12T10:53:19 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:53:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:53:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:53:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:53:43 | INFO | __main__ : Step: 23500 +2025-05-12T10:53:43 | INFO | __main__ : Current Frame Index within Batch Video: 129/247 +2025-05-12T10:53:43 | INFO | __main__ : Batch-wise Cosine Similarity | 85.38% +2025-05-12T10:53:43 | INFO | __main__ : Cosine Embedding Loss | 0.1462 +2025-05-12T10:53:43 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:53:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:53:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:54:07 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:54:07 | INFO | __main__ : Step: 23600 +2025-05-12T10:54:07 | INFO | __main__ : Current Frame Index within Batch Video: 229/247 +2025-05-12T10:54:07 | INFO | __main__ : Batch-wise Cosine Similarity | 89.19% +2025-05-12T10:54:07 | INFO | __main__ : Cosine Embedding Loss | 0.1081 +2025-05-12T10:54:07 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:54:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:54:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:54:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 97/4978] eta: 3 days, 9:20:50 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1006 eval_avg_sim: 0.6287 video-cosine_similarity: 0.8994 time: 59.7961 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:54:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:54:31 | INFO | __main__ : Step: 23700 +2025-05-12T10:54:31 | INFO | __main__ : Current Frame Index within Batch Video: 88/247 +2025-05-12T10:54:31 | INFO | __main__ : Batch-wise Cosine Similarity | 81.42% +2025-05-12T10:54:31 | INFO | __main__ : Cosine Embedding Loss | 0.1858 +2025-05-12T10:54:31 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:54:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:54:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:54:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:54:55 | INFO | __main__ : Step: 23800 +2025-05-12T10:54:55 | INFO | __main__ : Current Frame Index within Batch Video: 188/247 +2025-05-12T10:54:55 | INFO | __main__ : Batch-wise Cosine Similarity | 87.82% +2025-05-12T10:54:55 | INFO | __main__ : Cosine Embedding Loss | 0.1218 +2025-05-12T10:54:55 | INFO 
| __main__ : Learning Rate | 0.000004 +2025-05-12T10:54:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:54:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:55:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 98/4978] eta: 3 days, 9:17:40 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1064 eval_avg_sim: 0.6287 video-cosine_similarity: 0.8936 time: 59.3282 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:55:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:55:19 | INFO | __main__ : Step: 23900 +2025-05-12T10:55:19 | INFO | __main__ : Current Frame Index within Batch Video: 47/247 +2025-05-12T10:55:19 | INFO | __main__ : Batch-wise Cosine Similarity | 75.73% +2025-05-12T10:55:19 | INFO | __main__ : Cosine Embedding Loss | 0.2427 +2025-05-12T10:55:19 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:55:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:55:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:55:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:55:42 | INFO | __main__ : Step: 24000 +2025-05-12T10:55:42 | INFO | __main__ : Current Frame Index within Batch Video: 147/247 +2025-05-12T10:55:42 | INFO | __main__ : Batch-wise Cosine Similarity | 85.65% +2025-05-12T10:55:42 | INFO | __main__ : Cosine Embedding Loss | 0.1435 +2025-05-12T10:55:42 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:55:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:55:42 | INFO | __main__ : Evaluation Average Sim | 0.6287 +2025-05-12T10:55:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:55:43 | INFO | __main__ : Performing periodic evaluation at global step 24000... 
+2025-05-12T10:55:43 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:55:43 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:55:43 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:55:43 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T10:55:52 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5998 +2025-05-12T10:55:52 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0024000.png +2025-05-12T10:55:52 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T10:55:52 | INFO | __main__ : Evaluation at step 24000 complete. Average Similarity: 0.5998 +2025-05-12T10:56:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:56:15 | INFO | __main__ : Step: 24100 +2025-05-12T10:56:15 | INFO | __main__ : Current Frame Index within Batch Video: 247/247 +2025-05-12T10:56:15 | INFO | __main__ : Batch-wise Cosine Similarity | 90.82% +2025-05-12T10:56:15 | INFO | __main__ : Cosine Embedding Loss | 0.0918 +2025-05-12T10:56:15 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:56:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:56:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:56:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 99/4978] eta: 3 days, 9:22:07 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0918 eval_avg_sim: 0.5998 video-cosine_similarity: 0.9082 time: 59.7916 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:56:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:56:39 | INFO | __main__ : 
Step: 24200 +2025-05-12T10:56:39 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T10:56:39 | INFO | __main__ : Batch-wise Cosine Similarity | 83.93% +2025-05-12T10:56:39 | INFO | __main__ : Cosine Embedding Loss | 0.1607 +2025-05-12T10:56:39 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:56:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:56:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:57:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:57:03 | INFO | __main__ : Step: 24300 +2025-05-12T10:57:03 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T10:57:03 | INFO | __main__ : Batch-wise Cosine Similarity | 89.47% +2025-05-12T10:57:03 | INFO | __main__ : Cosine Embedding Loss | 0.1053 +2025-05-12T10:57:03 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:57:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:57:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:57:13 | INFO | utils.basic_utils : Train Epoch: [0] [ 100/4978] eta: 3 days, 9:19:00 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0798 eval_avg_sim: 0.5998 video-cosine_similarity: 0.9202 time: 59.7910 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T10:57:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:57:27 | INFO | __main__ : Step: 24400 +2025-05-12T10:57:27 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T10:57:27 | INFO | __main__ : Batch-wise Cosine Similarity | 79.06% +2025-05-12T10:57:27 | INFO | __main__ : Cosine Embedding Loss | 0.2094 +2025-05-12T10:57:27 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:57:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:57:27 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:57:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:57:51 | INFO | __main__ : Step: 24500 +2025-05-12T10:57:51 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T10:57:51 | INFO | __main__ : Batch-wise Cosine Similarity | 89.17% +2025-05-12T10:57:51 | INFO | __main__ : Cosine Embedding Loss | 0.1083 +2025-05-12T10:57:51 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:57:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:57:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:58:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 101/4978] eta: 3 days, 9:15:57 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1054 eval_avg_sim: 0.5998 video-cosine_similarity: 0.8946 time: 59.7910 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:58:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:58:15 | INFO | __main__ : Step: 24600 +2025-05-12T10:58:15 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T10:58:15 | INFO | __main__ : Batch-wise Cosine Similarity | 66.92% +2025-05-12T10:58:15 | INFO | __main__ : Cosine Embedding Loss | 0.3308 +2025-05-12T10:58:15 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:58:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:58:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:58:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:58:38 | INFO | __main__ : Step: 24700 +2025-05-12T10:58:38 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T10:58:38 | INFO | 
__main__ : Batch-wise Cosine Similarity | 86.20% +2025-05-12T10:58:38 | INFO | __main__ : Cosine Embedding Loss | 0.1380 +2025-05-12T10:58:38 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:58:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:58:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:02 | INFO | __main__ : Step: 24800 +2025-05-12T10:59:02 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T10:59:02 | INFO | __main__ : Batch-wise Cosine Similarity | 89.77% +2025-05-12T10:59:02 | INFO | __main__ : Cosine Embedding Loss | 0.1023 +2025-05-12T10:59:02 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:59:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:59:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 102/4978] eta: 3 days, 9:12:57 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0956 eval_avg_sim: 0.5998 video-cosine_similarity: 0.9044 time: 59.3177 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T10:59:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:26 | INFO | __main__ : Step: 24900 +2025-05-12T10:59:26 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T10:59:26 | INFO | __main__ : Batch-wise Cosine Similarity | 82.34% +2025-05-12T10:59:26 | INFO | __main__ : Cosine Embedding Loss | 0.1766 +2025-05-12T10:59:26 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:59:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:59:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:50 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:50 | INFO | __main__ : Step: 25000 +2025-05-12T10:59:50 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T10:59:50 | INFO | __main__ : Batch-wise Cosine Similarity | 82.74% +2025-05-12T10:59:50 | INFO | __main__ : Cosine Embedding Loss | 0.1726 +2025-05-12T10:59:50 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T10:59:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T10:59:50 | INFO | __main__ : Evaluation Average Sim | 0.5998 +2025-05-12T10:59:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T10:59:50 | INFO | __main__ : Saving checkpoint at global step 25000 +2025-05-12T10:59:50 | INFO | __main__ : Performing periodic evaluation at global step 25000... +2025-05-12T10:59:50 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T10:59:50 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T10:59:50 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T10:59:50 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:00:00 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4869 +2025-05-12T11:00:00 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0025000.png +2025-05-12T11:00:00 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:00:00 | INFO | __main__ : Evaluation at step 25000 complete. 
Average Similarity: 0.4869 +2025-05-12T11:00:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 103/4978] eta: 3 days, 9:17:22 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1491 eval_avg_sim: 0.4869 video-cosine_similarity: 0.8509 time: 59.7922 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:00:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:00:23 | INFO | __main__ : Step: 25100 +2025-05-12T11:00:23 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T11:00:23 | INFO | __main__ : Batch-wise Cosine Similarity | 72.37% +2025-05-12T11:00:23 | INFO | __main__ : Cosine Embedding Loss | 0.2763 +2025-05-12T11:00:23 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:00:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:00:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:00:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:00:47 | INFO | __main__ : Step: 25200 +2025-05-12T11:00:47 | INFO | __main__ : Current Frame Index within Batch Video: 142/247 +2025-05-12T11:00:47 | INFO | __main__ : Batch-wise Cosine Similarity | 83.81% +2025-05-12T11:00:47 | INFO | __main__ : Cosine Embedding Loss | 0.1619 +2025-05-12T11:00:47 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:00:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:00:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:01:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:01:11 | INFO | __main__ : Step: 25300 +2025-05-12T11:01:11 | INFO | __main__ : Current Frame Index within Batch Video: 242/247 +2025-05-12T11:01:11 | INFO | __main__ : Batch-wise Cosine Similarity | 89.37% +2025-05-12T11:01:11 | INFO | 
__main__ : Cosine Embedding Loss | 0.1063 +2025-05-12T11:01:11 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:01:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:01:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:01:12 | INFO | utils.basic_utils : Train Epoch: [0] [ 104/4978] eta: 3 days, 9:14:19 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1100 eval_avg_sim: 0.4869 video-cosine_similarity: 0.8900 time: 59.7880 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:01:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:01:35 | INFO | __main__ : Step: 25400 +2025-05-12T11:01:35 | INFO | __main__ : Current Frame Index within Batch Video: 101/247 +2025-05-12T11:01:35 | INFO | __main__ : Batch-wise Cosine Similarity | 81.90% +2025-05-12T11:01:35 | INFO | __main__ : Cosine Embedding Loss | 0.1810 +2025-05-12T11:01:35 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:01:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:01:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:01:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:01:58 | INFO | __main__ : Step: 25500 +2025-05-12T11:01:58 | INFO | __main__ : Current Frame Index within Batch Video: 201/247 +2025-05-12T11:01:58 | INFO | __main__ : Batch-wise Cosine Similarity | 87.38% +2025-05-12T11:01:58 | INFO | __main__ : Cosine Embedding Loss | 0.1262 +2025-05-12T11:01:58 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:01:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:01:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:02:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 105/4978] eta: 3 days, 9:11:21 lr: 
0.000004 temperature: 0.0126 video-loss_cosine: 0.1025 eval_avg_sim: 0.4869 video-cosine_similarity: 0.8975 time: 59.7840 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:02:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:02:22 | INFO | __main__ : Step: 25600 +2025-05-12T11:02:22 | INFO | __main__ : Current Frame Index within Batch Video: 60/247 +2025-05-12T11:02:22 | INFO | __main__ : Batch-wise Cosine Similarity | 79.38% +2025-05-12T11:02:22 | INFO | __main__ : Cosine Embedding Loss | 0.2062 +2025-05-12T11:02:22 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:02:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:02:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:02:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:02:46 | INFO | __main__ : Step: 25700 +2025-05-12T11:02:46 | INFO | __main__ : Current Frame Index within Batch Video: 160/247 +2025-05-12T11:02:46 | INFO | __main__ : Batch-wise Cosine Similarity | 86.56% +2025-05-12T11:02:46 | INFO | __main__ : Cosine Embedding Loss | 0.1344 +2025-05-12T11:02:46 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:02:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:02:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:07 | INFO | utils.basic_utils : Train Epoch: [0] [ 106/4978] eta: 3 days, 9:08:23 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1110 eval_avg_sim: 0.4869 video-cosine_similarity: 0.8890 time: 59.7820 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:03:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:10 | INFO | __main__ : Step: 25800 +2025-05-12T11:03:10 | INFO | __main__ : Current Frame Index 
within Batch Video: 19/247 +2025-05-12T11:03:10 | INFO | __main__ : Batch-wise Cosine Similarity | 64.58% +2025-05-12T11:03:10 | INFO | __main__ : Cosine Embedding Loss | 0.3542 +2025-05-12T11:03:10 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:03:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:03:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:34 | INFO | __main__ : Step: 25900 +2025-05-12T11:03:34 | INFO | __main__ : Current Frame Index within Batch Video: 119/247 +2025-05-12T11:03:34 | INFO | __main__ : Batch-wise Cosine Similarity | 85.78% +2025-05-12T11:03:34 | INFO | __main__ : Cosine Embedding Loss | 0.1422 +2025-05-12T11:03:34 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:03:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:03:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:58 | INFO | __main__ : Step: 26000 +2025-05-12T11:03:58 | INFO | __main__ : Current Frame Index within Batch Video: 219/247 +2025-05-12T11:03:58 | INFO | __main__ : Batch-wise Cosine Similarity | 92.14% +2025-05-12T11:03:58 | INFO | __main__ : Cosine Embedding Loss | 0.0786 +2025-05-12T11:03:58 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:03:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:03:58 | INFO | __main__ : Evaluation Average Sim | 0.4869 +2025-05-12T11:03:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:03:58 | INFO | __main__ : Performing periodic evaluation at global step 26000... 
+2025-05-12T11:03:58 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:03:58 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:03:58 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:03:58 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:04:07 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4638 +2025-05-12T11:04:07 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0026000.png +2025-05-12T11:04:07 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:04:07 | INFO | __main__ : Evaluation at step 26000 complete. Average Similarity: 0.4638 +2025-05-12T11:04:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 107/4978] eta: 3 days, 9:12:34 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0720 eval_avg_sim: 0.4638 video-cosine_similarity: 0.9280 time: 59.7726 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:04:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:04:31 | INFO | __main__ : Step: 26100 +2025-05-12T11:04:31 | INFO | __main__ : Current Frame Index within Batch Video: 78/247 +2025-05-12T11:04:31 | INFO | __main__ : Batch-wise Cosine Similarity | 80.37% +2025-05-12T11:04:31 | INFO | __main__ : Cosine Embedding Loss | 0.1963 +2025-05-12T11:04:31 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:04:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:04:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:04:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:04:55 | INFO | __main__ : 
Step: 26200 +2025-05-12T11:04:55 | INFO | __main__ : Current Frame Index within Batch Video: 178/247 +2025-05-12T11:04:55 | INFO | __main__ : Batch-wise Cosine Similarity | 87.01% +2025-05-12T11:04:55 | INFO | __main__ : Cosine Embedding Loss | 0.1299 +2025-05-12T11:04:55 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:04:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:04:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:05:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 108/4978] eta: 3 days, 9:09:35 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1013 eval_avg_sim: 0.4638 video-cosine_similarity: 0.8987 time: 59.7689 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:05:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:05:18 | INFO | __main__ : Step: 26300 +2025-05-12T11:05:19 | INFO | __main__ : Current Frame Index within Batch Video: 37/247 +2025-05-12T11:05:19 | INFO | __main__ : Batch-wise Cosine Similarity | 73.90% +2025-05-12T11:05:19 | INFO | __main__ : Cosine Embedding Loss | 0.2610 +2025-05-12T11:05:19 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:05:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:05:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:05:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:05:42 | INFO | __main__ : Step: 26400 +2025-05-12T11:05:42 | INFO | __main__ : Current Frame Index within Batch Video: 137/247 +2025-05-12T11:05:42 | INFO | __main__ : Batch-wise Cosine Similarity | 84.76% +2025-05-12T11:05:42 | INFO | __main__ : Cosine Embedding Loss | 0.1524 +2025-05-12T11:05:42 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:05:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:05:42 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:06:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:06:06 | INFO | __main__ : Step: 26500 +2025-05-12T11:06:06 | INFO | __main__ : Current Frame Index within Batch Video: 237/247 +2025-05-12T11:06:06 | INFO | __main__ : Batch-wise Cosine Similarity | 89.07% +2025-05-12T11:06:06 | INFO | __main__ : Cosine Embedding Loss | 0.1093 +2025-05-12T11:06:06 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:06:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:06:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:06:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 109/4978] eta: 3 days, 9:06:43 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1120 eval_avg_sim: 0.4638 video-cosine_similarity: 0.8880 time: 59.7681 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:06:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:06:30 | INFO | __main__ : Step: 26600 +2025-05-12T11:06:30 | INFO | __main__ : Current Frame Index within Batch Video: 96/247 +2025-05-12T11:06:30 | INFO | __main__ : Batch-wise Cosine Similarity | 77.89% +2025-05-12T11:06:30 | INFO | __main__ : Cosine Embedding Loss | 0.2211 +2025-05-12T11:06:30 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:06:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:06:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:06:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:06:54 | INFO | __main__ : Step: 26700 +2025-05-12T11:06:54 | INFO | __main__ : Current Frame Index within Batch Video: 196/247 +2025-05-12T11:06:54 | INFO | 
__main__ : Batch-wise Cosine Similarity | 87.27% +2025-05-12T11:06:54 | INFO | __main__ : Cosine Embedding Loss | 0.1273 +2025-05-12T11:06:54 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:06:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:06:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:07:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 110/4978] eta: 3 days, 9:03:51 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1266 eval_avg_sim: 0.4638 video-cosine_similarity: 0.8734 time: 59.7680 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:07:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:07:18 | INFO | __main__ : Step: 26800 +2025-05-12T11:07:18 | INFO | __main__ : Current Frame Index within Batch Video: 55/247 +2025-05-12T11:07:18 | INFO | __main__ : Batch-wise Cosine Similarity | 77.23% +2025-05-12T11:07:18 | INFO | __main__ : Cosine Embedding Loss | 0.2277 +2025-05-12T11:07:18 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:07:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:07:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:07:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:07:41 | INFO | __main__ : Step: 26900 +2025-05-12T11:07:41 | INFO | __main__ : Current Frame Index within Batch Video: 155/247 +2025-05-12T11:07:41 | INFO | __main__ : Batch-wise Cosine Similarity | 85.50% +2025-05-12T11:07:41 | INFO | __main__ : Cosine Embedding Loss | 0.1450 +2025-05-12T11:07:41 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:07:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:07:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:08:03 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 111/4978] eta: 3 days, 9:01:03 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1048 eval_avg_sim: 0.4638 video-cosine_similarity: 0.8952 time: 59.3016 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:08:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:08:05 | INFO | __main__ : Step: 27000 +2025-05-12T11:08:05 | INFO | __main__ : Current Frame Index within Batch Video: 14/247 +2025-05-12T11:08:05 | INFO | __main__ : Batch-wise Cosine Similarity | 61.40% +2025-05-12T11:08:05 | INFO | __main__ : Cosine Embedding Loss | 0.3860 +2025-05-12T11:08:05 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:08:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:08:05 | INFO | __main__ : Evaluation Average Sim | 0.4638 +2025-05-12T11:08:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:08:06 | INFO | __main__ : Performing periodic evaluation at global step 27000... +2025-05-12T11:08:06 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:08:06 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:08:06 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:08:06 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:08:16 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5878 +2025-05-12T11:08:16 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0027000.png +2025-05-12T11:08:16 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:08:16 | INFO | __main__ : Evaluation at step 27000 complete. 
Average Similarity: 0.5878 +2025-05-12T11:08:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:08:39 | INFO | __main__ : Step: 27100 +2025-05-12T11:08:39 | INFO | __main__ : Current Frame Index within Batch Video: 114/247 +2025-05-12T11:08:39 | INFO | __main__ : Batch-wise Cosine Similarity | 83.58% +2025-05-12T11:08:39 | INFO | __main__ : Cosine Embedding Loss | 0.1642 +2025-05-12T11:08:39 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:08:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:08:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:09:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:09:03 | INFO | __main__ : Step: 27200 +2025-05-12T11:09:03 | INFO | __main__ : Current Frame Index within Batch Video: 214/247 +2025-05-12T11:09:03 | INFO | __main__ : Batch-wise Cosine Similarity | 86.84% +2025-05-12T11:09:03 | INFO | __main__ : Cosine Embedding Loss | 0.1316 +2025-05-12T11:09:03 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:09:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:09:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:09:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 112/4978] eta: 3 days, 9:05:19 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.1129 eval_avg_sim: 0.5878 video-cosine_similarity: 0.8871 time: 59.7948 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:09:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:09:27 | INFO | __main__ : Step: 27300 +2025-05-12T11:09:27 | INFO | __main__ : Current Frame Index within Batch Video: 73/247 +2025-05-12T11:09:27 | INFO | __main__ : Batch-wise Cosine Similarity | 83.45% +2025-05-12T11:09:27 | INFO | 
__main__ : Cosine Embedding Loss | 0.1655 +2025-05-12T11:09:27 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:09:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:09:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:09:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:09:50 | INFO | __main__ : Step: 27400 +2025-05-12T11:09:50 | INFO | __main__ : Current Frame Index within Batch Video: 173/247 +2025-05-12T11:09:50 | INFO | __main__ : Batch-wise Cosine Similarity | 88.01% +2025-05-12T11:09:50 | INFO | __main__ : Cosine Embedding Loss | 0.1199 +2025-05-12T11:09:50 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:09:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:09:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:10:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 113/4978] eta: 3 days, 9:02:30 lr: 0.000004 temperature: 0.0126 video-loss_cosine: 0.0923 eval_avg_sim: 0.5878 video-cosine_similarity: 0.9077 time: 59.7939 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:10:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:10:14 | INFO | __main__ : Step: 27500 +2025-05-12T11:10:14 | INFO | __main__ : Current Frame Index within Batch Video: 32/247 +2025-05-12T11:10:14 | INFO | __main__ : Batch-wise Cosine Similarity | 74.82% +2025-05-12T11:10:14 | INFO | __main__ : Cosine Embedding Loss | 0.2518 +2025-05-12T11:10:14 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:10:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:10:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:10:38 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:10:38 | INFO | __main__ : Step: 27600 +2025-05-12T11:10:38 | INFO | __main__ : Current Frame Index within Batch Video: 132/247 +2025-05-12T11:10:38 | INFO | __main__ : Batch-wise Cosine Similarity | 86.29% +2025-05-12T11:10:38 | INFO | __main__ : Cosine Embedding Loss | 0.1371 +2025-05-12T11:10:38 | INFO | __main__ : Learning Rate | 0.000004 +2025-05-12T11:10:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:10:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:11:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:11:02 | INFO | __main__ : Step: 27700 +2025-05-12T11:11:02 | INFO | __main__ : Current Frame Index within Batch Video: 232/247 +2025-05-12T11:11:02 | INFO | __main__ : Batch-wise Cosine Similarity | 89.07% +2025-05-12T11:11:02 | INFO | __main__ : Cosine Embedding Loss | 0.1093 +2025-05-12T11:11:02 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:11:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:11:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:11:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 114/4978] eta: 3 days, 8:59:43 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1018 eval_avg_sim: 0.5878 video-cosine_similarity: 0.8982 time: 59.7945 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:11:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:11:26 | INFO | __main__ : Step: 27800 +2025-05-12T11:11:26 | INFO | __main__ : Current Frame Index within Batch Video: 91/247 +2025-05-12T11:11:26 | INFO | __main__ : Batch-wise Cosine Similarity | 81.04% +2025-05-12T11:11:26 | INFO | __main__ : Cosine Embedding Loss | 0.1896 +2025-05-12T11:11:26 | INFO 
| __main__ : Learning Rate | 0.000005 +2025-05-12T11:11:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:11:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:11:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:11:50 | INFO | __main__ : Step: 27900 +2025-05-12T11:11:50 | INFO | __main__ : Current Frame Index within Batch Video: 191/247 +2025-05-12T11:11:50 | INFO | __main__ : Batch-wise Cosine Similarity | 89.43% +2025-05-12T11:11:50 | INFO | __main__ : Cosine Embedding Loss | 0.1057 +2025-05-12T11:11:50 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:11:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:11:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:12:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 115/4978] eta: 3 days, 8:56:56 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1019 eval_avg_sim: 0.5878 video-cosine_similarity: 0.8981 time: 59.3243 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:12:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:12:14 | INFO | __main__ : Step: 28000 +2025-05-12T11:12:14 | INFO | __main__ : Current Frame Index within Batch Video: 50/247 +2025-05-12T11:12:14 | INFO | __main__ : Batch-wise Cosine Similarity | 77.61% +2025-05-12T11:12:14 | INFO | __main__ : Cosine Embedding Loss | 0.2239 +2025-05-12T11:12:14 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:12:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:12:14 | INFO | __main__ : Evaluation Average Sim | 0.5878 +2025-05-12T11:12:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:12:14 | INFO | __main__ : Performing periodic evaluation at global step 28000... 
+2025-05-12T11:12:14 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:12:14 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:12:14 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:12:14 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:12:23 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6096 +2025-05-12T11:12:23 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0028000.png +2025-05-12T11:12:23 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:12:23 | INFO | __main__ : Evaluation at step 28000 complete. Average Similarity: 0.6096 +2025-05-12T11:12:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:12:47 | INFO | __main__ : Step: 28100 +2025-05-12T11:12:47 | INFO | __main__ : Current Frame Index within Batch Video: 150/247 +2025-05-12T11:12:47 | INFO | __main__ : Batch-wise Cosine Similarity | 88.87% +2025-05-12T11:12:47 | INFO | __main__ : Cosine Embedding Loss | 0.1113 +2025-05-12T11:12:47 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:12:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:12:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:13:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 116/4978] eta: 3 days, 9:00:42 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1111 eval_avg_sim: 0.6096 video-cosine_similarity: 0.8889 time: 59.7924 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:13:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:13:11 | INFO | __main__ : 
Step: 28200 +2025-05-12T11:13:11 | INFO | __main__ : Current Frame Index within Batch Video: 9/247 +2025-05-12T11:13:11 | INFO | __main__ : Batch-wise Cosine Similarity | 59.38% +2025-05-12T11:13:11 | INFO | __main__ : Cosine Embedding Loss | 0.4062 +2025-05-12T11:13:11 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:13:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:13:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:13:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:13:34 | INFO | __main__ : Step: 28300 +2025-05-12T11:13:34 | INFO | __main__ : Current Frame Index within Batch Video: 109/247 +2025-05-12T11:13:34 | INFO | __main__ : Batch-wise Cosine Similarity | 84.96% +2025-05-12T11:13:34 | INFO | __main__ : Cosine Embedding Loss | 0.1504 +2025-05-12T11:13:34 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:13:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:13:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:13:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:13:58 | INFO | __main__ : Step: 28400 +2025-05-12T11:13:58 | INFO | __main__ : Current Frame Index within Batch Video: 209/247 +2025-05-12T11:13:58 | INFO | __main__ : Batch-wise Cosine Similarity | 91.36% +2025-05-12T11:13:58 | INFO | __main__ : Cosine Embedding Loss | 0.0864 +2025-05-12T11:13:58 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:13:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:13:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:14:07 | INFO | utils.basic_utils : Train Epoch: [0] [ 117/4978] eta: 3 days, 8:57:59 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0758 
eval_avg_sim: 0.6096 video-cosine_similarity: 0.9242 time: 59.7906 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:14:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:14:22 | INFO | __main__ : Step: 28500 +2025-05-12T11:14:22 | INFO | __main__ : Current Frame Index within Batch Video: 68/247 +2025-05-12T11:14:22 | INFO | __main__ : Batch-wise Cosine Similarity | 78.80% +2025-05-12T11:14:22 | INFO | __main__ : Cosine Embedding Loss | 0.2120 +2025-05-12T11:14:22 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:14:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:14:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:14:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:14:46 | INFO | __main__ : Step: 28600 +2025-05-12T11:14:46 | INFO | __main__ : Current Frame Index within Batch Video: 168/247 +2025-05-12T11:14:46 | INFO | __main__ : Batch-wise Cosine Similarity | 86.97% +2025-05-12T11:14:46 | INFO | __main__ : Cosine Embedding Loss | 0.1303 +2025-05-12T11:14:46 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:14:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:14:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:15:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 118/4978] eta: 3 days, 8:55:15 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0968 eval_avg_sim: 0.6096 video-cosine_similarity: 0.9032 time: 59.7937 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:15:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:15:10 | INFO | __main__ : Step: 28700 +2025-05-12T11:15:10 | INFO | __main__ : Current Frame Index within Batch Video: 27/247 +2025-05-12T11:15:10 | INFO 
| __main__ : Batch-wise Cosine Similarity | 70.23% +2025-05-12T11:15:10 | INFO | __main__ : Cosine Embedding Loss | 0.2977 +2025-05-12T11:15:10 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:15:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:15:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:15:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:15:34 | INFO | __main__ : Step: 28800 +2025-05-12T11:15:34 | INFO | __main__ : Current Frame Index within Batch Video: 127/247 +2025-05-12T11:15:34 | INFO | __main__ : Batch-wise Cosine Similarity | 85.30% +2025-05-12T11:15:34 | INFO | __main__ : Cosine Embedding Loss | 0.1470 +2025-05-12T11:15:34 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:15:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:15:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:15:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:15:57 | INFO | __main__ : Step: 28900 +2025-05-12T11:15:57 | INFO | __main__ : Current Frame Index within Batch Video: 227/247 +2025-05-12T11:15:57 | INFO | __main__ : Batch-wise Cosine Similarity | 90.15% +2025-05-12T11:15:57 | INFO | __main__ : Cosine Embedding Loss | 0.0985 +2025-05-12T11:15:57 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:15:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:15:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:16:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 119/4978] eta: 3 days, 8:52:36 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0914 eval_avg_sim: 0.6096 video-cosine_similarity: 0.9086 time: 59.3347 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:16:21 | INFO 
| __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:16:21 | INFO | __main__ : Step: 29000 +2025-05-12T11:16:21 | INFO | __main__ : Current Frame Index within Batch Video: 86/247 +2025-05-12T11:16:21 | INFO | __main__ : Batch-wise Cosine Similarity | 82.68% +2025-05-12T11:16:21 | INFO | __main__ : Cosine Embedding Loss | 0.1732 +2025-05-12T11:16:21 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:16:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:16:21 | INFO | __main__ : Evaluation Average Sim | 0.6096 +2025-05-12T11:16:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:16:22 | INFO | __main__ : Performing periodic evaluation at global step 29000... +2025-05-12T11:16:22 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:16:22 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:16:22 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:16:22 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:16:31 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5716 +2025-05-12T11:16:31 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0029000.png +2025-05-12T11:16:31 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:16:31 | INFO | __main__ : Evaluation at step 29000 complete. 
Average Similarity: 0.5716 +2025-05-12T11:16:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:16:54 | INFO | __main__ : Step: 29100 +2025-05-12T11:16:54 | INFO | __main__ : Current Frame Index within Batch Video: 186/247 +2025-05-12T11:16:54 | INFO | __main__ : Batch-wise Cosine Similarity | 88.56% +2025-05-12T11:16:54 | INFO | __main__ : Cosine Embedding Loss | 0.1144 +2025-05-12T11:16:54 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:16:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:16:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:17:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 120/4978] eta: 3 days, 8:56:06 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1118 eval_avg_sim: 0.5716 video-cosine_similarity: 0.8882 time: 59.7964 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:17:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:17:18 | INFO | __main__ : Step: 29200 +2025-05-12T11:17:18 | INFO | __main__ : Current Frame Index within Batch Video: 45/247 +2025-05-12T11:17:18 | INFO | __main__ : Batch-wise Cosine Similarity | 77.57% +2025-05-12T11:17:18 | INFO | __main__ : Cosine Embedding Loss | 0.2243 +2025-05-12T11:17:18 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:17:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:17:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:17:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:17:42 | INFO | __main__ : Step: 29300 +2025-05-12T11:17:42 | INFO | __main__ : Current Frame Index within Batch Video: 145/247 +2025-05-12T11:17:42 | INFO | __main__ : Batch-wise Cosine Similarity | 87.14% +2025-05-12T11:17:42 | INFO | 
__main__ : Cosine Embedding Loss | 0.1286 +2025-05-12T11:17:42 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:17:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:17:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:18:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:18:06 | INFO | __main__ : Step: 29400 +2025-05-12T11:18:06 | INFO | __main__ : Current Frame Index within Batch Video: 245/247 +2025-05-12T11:18:06 | INFO | __main__ : Batch-wise Cosine Similarity | 93.90% +2025-05-12T11:18:06 | INFO | __main__ : Cosine Embedding Loss | 0.0610 +2025-05-12T11:18:06 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:18:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:18:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:18:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 121/4978] eta: 3 days, 8:53:24 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0615 eval_avg_sim: 0.5716 video-cosine_similarity: 0.9385 time: 59.7940 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:18:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:18:30 | INFO | __main__ : Step: 29500 +2025-05-12T11:18:30 | INFO | __main__ : Current Frame Index within Batch Video: 104/247 +2025-05-12T11:18:30 | INFO | __main__ : Batch-wise Cosine Similarity | 84.26% +2025-05-12T11:18:30 | INFO | __main__ : Cosine Embedding Loss | 0.1574 +2025-05-12T11:18:30 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:18:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:18:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:18:53 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:18:53 | INFO | __main__ : Step: 29600 +2025-05-12T11:18:53 | INFO | __main__ : Current Frame Index within Batch Video: 204/247 +2025-05-12T11:18:53 | INFO | __main__ : Batch-wise Cosine Similarity | 88.83% +2025-05-12T11:18:53 | INFO | __main__ : Cosine Embedding Loss | 0.1117 +2025-05-12T11:18:53 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:18:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:18:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:19:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 122/4978] eta: 3 days, 8:50:42 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1072 eval_avg_sim: 0.5716 video-cosine_similarity: 0.8928 time: 59.7915 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:19:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:19:17 | INFO | __main__ : Step: 29700 +2025-05-12T11:19:17 | INFO | __main__ : Current Frame Index within Batch Video: 63/247 +2025-05-12T11:19:17 | INFO | __main__ : Batch-wise Cosine Similarity | 78.29% +2025-05-12T11:19:17 | INFO | __main__ : Cosine Embedding Loss | 0.2171 +2025-05-12T11:19:17 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:19:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:19:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:19:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:19:41 | INFO | __main__ : Step: 29800 +2025-05-12T11:19:41 | INFO | __main__ : Current Frame Index within Batch Video: 163/247 +2025-05-12T11:19:41 | INFO | __main__ : Batch-wise Cosine Similarity | 85.56% +2025-05-12T11:19:41 | INFO | __main__ : Cosine Embedding Loss | 0.1444 +2025-05-12T11:19:41 | INFO 
| __main__ : Learning Rate | 0.000005 +2025-05-12T11:19:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:19:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:20:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 123/4978] eta: 3 days, 8:48:05 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0901 eval_avg_sim: 0.5716 video-cosine_similarity: 0.9099 time: 59.3190 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:20:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:20:05 | INFO | __main__ : Step: 29900 +2025-05-12T11:20:05 | INFO | __main__ : Current Frame Index within Batch Video: 22/247 +2025-05-12T11:20:05 | INFO | __main__ : Batch-wise Cosine Similarity | 68.05% +2025-05-12T11:20:05 | INFO | __main__ : Cosine Embedding Loss | 0.3195 +2025-05-12T11:20:05 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:20:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:20:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:20:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:20:29 | INFO | __main__ : Step: 30000 +2025-05-12T11:20:29 | INFO | __main__ : Current Frame Index within Batch Video: 122/247 +2025-05-12T11:20:29 | INFO | __main__ : Batch-wise Cosine Similarity | 86.44% +2025-05-12T11:20:29 | INFO | __main__ : Cosine Embedding Loss | 0.1356 +2025-05-12T11:20:29 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:20:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:20:29 | INFO | __main__ : Evaluation Average Sim | 0.5716 +2025-05-12T11:20:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:20:29 | INFO | __main__ : Saving checkpoint at global step 30000 +2025-05-12T11:20:29 | 
INFO | __main__ : Performing periodic evaluation at global step 30000... +2025-05-12T11:20:29 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:20:29 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:20:29 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:20:29 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:20:38 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6298 +2025-05-12T11:20:38 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0030000.png +2025-05-12T11:20:38 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:20:38 | INFO | __main__ : Evaluation at step 30000 complete. Average Similarity: 0.6298 +2025-05-12T11:21:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:21:02 | INFO | __main__ : Step: 30100 +2025-05-12T11:21:02 | INFO | __main__ : Current Frame Index within Batch Video: 222/247 +2025-05-12T11:21:02 | INFO | __main__ : Batch-wise Cosine Similarity | 88.17% +2025-05-12T11:21:02 | INFO | __main__ : Cosine Embedding Loss | 0.1183 +2025-05-12T11:21:02 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:21:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:21:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:21:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 124/4978] eta: 3 days, 8:51:35 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0950 eval_avg_sim: 0.6298 video-cosine_similarity: 0.9050 time: 59.7928 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:21:26 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:21:26 | INFO | __main__ : Step: 30200 +2025-05-12T11:21:26 | INFO | __main__ : Current Frame Index within Batch Video: 81/247 +2025-05-12T11:21:26 | INFO | __main__ : Batch-wise Cosine Similarity | 84.93% +2025-05-12T11:21:26 | INFO | __main__ : Cosine Embedding Loss | 0.1507 +2025-05-12T11:21:26 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:21:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:21:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:21:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:21:50 | INFO | __main__ : Step: 30300 +2025-05-12T11:21:50 | INFO | __main__ : Current Frame Index within Batch Video: 181/247 +2025-05-12T11:21:50 | INFO | __main__ : Batch-wise Cosine Similarity | 91.30% +2025-05-12T11:21:50 | INFO | __main__ : Cosine Embedding Loss | 0.0870 +2025-05-12T11:21:50 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:21:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:21:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:22:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 125/4978] eta: 3 days, 8:48:57 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0658 eval_avg_sim: 0.6298 video-cosine_similarity: 0.9342 time: 59.7927 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:22:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:22:14 | INFO | __main__ : Step: 30400 +2025-05-12T11:22:14 | INFO | __main__ : Current Frame Index within Batch Video: 40/247 +2025-05-12T11:22:14 | INFO | __main__ : Batch-wise Cosine Similarity | 79.13% +2025-05-12T11:22:14 | INFO | __main__ : Cosine Embedding Loss | 0.2087 +2025-05-12T11:22:14 | INFO 
| __main__ : Learning Rate | 0.000005 +2025-05-12T11:22:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:22:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:22:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:22:37 | INFO | __main__ : Step: 30500 +2025-05-12T11:22:37 | INFO | __main__ : Current Frame Index within Batch Video: 140/247 +2025-05-12T11:22:37 | INFO | __main__ : Batch-wise Cosine Similarity | 89.53% +2025-05-12T11:22:37 | INFO | __main__ : Cosine Embedding Loss | 0.1047 +2025-05-12T11:22:37 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:22:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:22:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:23:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:23:01 | INFO | __main__ : Step: 30600 +2025-05-12T11:23:01 | INFO | __main__ : Current Frame Index within Batch Video: 240/247 +2025-05-12T11:23:01 | INFO | __main__ : Batch-wise Cosine Similarity | 93.47% +2025-05-12T11:23:01 | INFO | __main__ : Cosine Embedding Loss | 0.0653 +2025-05-12T11:23:01 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:23:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:23:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:23:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 126/4978] eta: 3 days, 8:46:22 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0686 eval_avg_sim: 0.6298 video-cosine_similarity: 0.9314 time: 59.7952 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:23:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:23:25 | INFO | __main__ : Step: 30700 
+2025-05-12T11:23:25 | INFO | __main__ : Current Frame Index within Batch Video: 99/247 +2025-05-12T11:23:25 | INFO | __main__ : Batch-wise Cosine Similarity | 83.64% +2025-05-12T11:23:25 | INFO | __main__ : Cosine Embedding Loss | 0.1636 +2025-05-12T11:23:25 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:23:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:23:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:23:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:23:49 | INFO | __main__ : Step: 30800 +2025-05-12T11:23:49 | INFO | __main__ : Current Frame Index within Batch Video: 199/247 +2025-05-12T11:23:49 | INFO | __main__ : Batch-wise Cosine Similarity | 91.23% +2025-05-12T11:23:49 | INFO | __main__ : Cosine Embedding Loss | 0.0877 +2025-05-12T11:23:49 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:23:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:23:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:24:00 | INFO | utils.basic_utils : Train Epoch: [0] [ 127/4978] eta: 3 days, 8:43:46 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0777 eval_avg_sim: 0.6298 video-cosine_similarity: 0.9223 time: 59.3239 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:24:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:24:13 | INFO | __main__ : Step: 30900 +2025-05-12T11:24:13 | INFO | __main__ : Current Frame Index within Batch Video: 58/247 +2025-05-12T11:24:13 | INFO | __main__ : Batch-wise Cosine Similarity | 79.09% +2025-05-12T11:24:13 | INFO | __main__ : Cosine Embedding Loss | 0.2091 +2025-05-12T11:24:13 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:24:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:24:13 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:24:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:24:37 | INFO | __main__ : Step: 31000 +2025-05-12T11:24:37 | INFO | __main__ : Current Frame Index within Batch Video: 158/247 +2025-05-12T11:24:37 | INFO | __main__ : Batch-wise Cosine Similarity | 87.78% +2025-05-12T11:24:37 | INFO | __main__ : Cosine Embedding Loss | 0.1222 +2025-05-12T11:24:37 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:24:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:24:37 | INFO | __main__ : Evaluation Average Sim | 0.6298 +2025-05-12T11:24:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:24:37 | INFO | __main__ : Performing periodic evaluation at global step 31000... +2025-05-12T11:24:37 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:24:37 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:24:37 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:24:37 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:24:46 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5905 +2025-05-12T11:24:46 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0031000.png +2025-05-12T11:24:46 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:24:46 | INFO | __main__ : Evaluation at step 31000 complete. 
Average Similarity: 0.5905 +2025-05-12T11:25:07 | INFO | utils.basic_utils : Train Epoch: [0] [ 128/4978] eta: 3 days, 8:47:04 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0866 eval_avg_sim: 0.5905 video-cosine_similarity: 0.9134 time: 59.7934 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:25:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:25:10 | INFO | __main__ : Step: 31100 +2025-05-12T11:25:10 | INFO | __main__ : Current Frame Index within Batch Video: 17/247 +2025-05-12T11:25:10 | INFO | __main__ : Batch-wise Cosine Similarity | 63.73% +2025-05-12T11:25:10 | INFO | __main__ : Cosine Embedding Loss | 0.3627 +2025-05-12T11:25:10 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:25:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:25:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:25:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:25:34 | INFO | __main__ : Step: 31200 +2025-05-12T11:25:34 | INFO | __main__ : Current Frame Index within Batch Video: 117/247 +2025-05-12T11:25:34 | INFO | __main__ : Batch-wise Cosine Similarity | 84.59% +2025-05-12T11:25:34 | INFO | __main__ : Cosine Embedding Loss | 0.1541 +2025-05-12T11:25:34 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:25:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:25:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:25:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:25:57 | INFO | __main__ : Step: 31300 +2025-05-12T11:25:57 | INFO | __main__ : Current Frame Index within Batch Video: 217/247 +2025-05-12T11:25:57 | INFO | __main__ : Batch-wise Cosine Similarity | 90.44% +2025-05-12T11:25:57 | INFO | 
__main__ : Cosine Embedding Loss | 0.0956 +2025-05-12T11:25:57 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:25:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:25:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:26:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 129/4978] eta: 3 days, 8:44:31 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1080 eval_avg_sim: 0.5905 video-cosine_similarity: 0.8920 time: 59.7939 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:26:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:26:21 | INFO | __main__ : Step: 31400 +2025-05-12T11:26:21 | INFO | __main__ : Current Frame Index within Batch Video: 76/247 +2025-05-12T11:26:21 | INFO | __main__ : Batch-wise Cosine Similarity | 84.08% +2025-05-12T11:26:21 | INFO | __main__ : Cosine Embedding Loss | 0.1592 +2025-05-12T11:26:21 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:26:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:26:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:26:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:26:45 | INFO | __main__ : Step: 31500 +2025-05-12T11:26:45 | INFO | __main__ : Current Frame Index within Batch Video: 176/247 +2025-05-12T11:26:45 | INFO | __main__ : Batch-wise Cosine Similarity | 89.24% +2025-05-12T11:26:45 | INFO | __main__ : Cosine Embedding Loss | 0.1076 +2025-05-12T11:26:45 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:26:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:26:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 130/4978] eta: 3 days, 8:41:54 lr: 
0.000005 temperature: 0.0126 video-loss_cosine: 0.0668 eval_avg_sim: 0.5905 video-cosine_similarity: 0.9332 time: 59.7896 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:27:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:09 | INFO | __main__ : Step: 31600 +2025-05-12T11:27:09 | INFO | __main__ : Current Frame Index within Batch Video: 35/247 +2025-05-12T11:27:09 | INFO | __main__ : Batch-wise Cosine Similarity | 73.67% +2025-05-12T11:27:09 | INFO | __main__ : Cosine Embedding Loss | 0.2633 +2025-05-12T11:27:09 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:27:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:27:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:33 | INFO | __main__ : Step: 31700 +2025-05-12T11:27:33 | INFO | __main__ : Current Frame Index within Batch Video: 135/247 +2025-05-12T11:27:33 | INFO | __main__ : Batch-wise Cosine Similarity | 89.40% +2025-05-12T11:27:33 | INFO | __main__ : Cosine Embedding Loss | 0.1060 +2025-05-12T11:27:33 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:27:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:27:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:56 | INFO | __main__ : Step: 31800 +2025-05-12T11:27:56 | INFO | __main__ : Current Frame Index within Batch Video: 235/247 +2025-05-12T11:27:56 | INFO | __main__ : Batch-wise Cosine Similarity | 93.63% +2025-05-12T11:27:56 | INFO | __main__ : Cosine Embedding Loss | 0.0637 +2025-05-12T11:27:56 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:27:56 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:27:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:27:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 131/4978] eta: 3 days, 8:39:24 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0625 eval_avg_sim: 0.5905 video-cosine_similarity: 0.9375 time: 59.7900 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:28:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:28:20 | INFO | __main__ : Step: 31900 +2025-05-12T11:28:20 | INFO | __main__ : Current Frame Index within Batch Video: 94/247 +2025-05-12T11:28:20 | INFO | __main__ : Batch-wise Cosine Similarity | 85.29% +2025-05-12T11:28:20 | INFO | __main__ : Cosine Embedding Loss | 0.1471 +2025-05-12T11:28:20 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:28:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:28:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:28:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:28:44 | INFO | __main__ : Step: 32000 +2025-05-12T11:28:44 | INFO | __main__ : Current Frame Index within Batch Video: 194/247 +2025-05-12T11:28:44 | INFO | __main__ : Batch-wise Cosine Similarity | 92.15% +2025-05-12T11:28:44 | INFO | __main__ : Cosine Embedding Loss | 0.0785 +2025-05-12T11:28:44 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:28:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:28:44 | INFO | __main__ : Evaluation Average Sim | 0.5905 +2025-05-12T11:28:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:28:44 | INFO | __main__ : Performing periodic evaluation at global step 32000... 
+2025-05-12T11:28:44 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:28:44 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:28:44 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:28:44 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:28:54 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6058 +2025-05-12T11:28:54 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0032000.png +2025-05-12T11:28:54 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:28:54 | INFO | __main__ : Evaluation at step 32000 complete. Average Similarity: 0.6058 +2025-05-12T11:29:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 132/4978] eta: 3 days, 8:42:34 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0713 eval_avg_sim: 0.6058 video-cosine_similarity: 0.9287 time: 59.7647 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:29:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:29:17 | INFO | __main__ : Step: 32100 +2025-05-12T11:29:17 | INFO | __main__ : Current Frame Index within Batch Video: 53/247 +2025-05-12T11:29:17 | INFO | __main__ : Batch-wise Cosine Similarity | 77.15% +2025-05-12T11:29:17 | INFO | __main__ : Cosine Embedding Loss | 0.2285 +2025-05-12T11:29:17 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:29:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:29:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:29:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:29:41 | INFO | __main__ : 
Step: 32200 +2025-05-12T11:29:41 | INFO | __main__ : Current Frame Index within Batch Video: 153/247 +2025-05-12T11:29:41 | INFO | __main__ : Batch-wise Cosine Similarity | 88.55% +2025-05-12T11:29:41 | INFO | __main__ : Cosine Embedding Loss | 0.1145 +2025-05-12T11:29:41 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:29:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:29:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:30:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 133/4978] eta: 3 days, 8:40:02 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1020 eval_avg_sim: 0.6058 video-cosine_similarity: 0.8980 time: 59.7644 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:30:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:30:05 | INFO | __main__ : Step: 32300 +2025-05-12T11:30:05 | INFO | __main__ : Current Frame Index within Batch Video: 12/247 +2025-05-12T11:30:05 | INFO | __main__ : Batch-wise Cosine Similarity | 58.05% +2025-05-12T11:30:05 | INFO | __main__ : Cosine Embedding Loss | 0.4195 +2025-05-12T11:30:05 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:30:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:30:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:30:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:30:29 | INFO | __main__ : Step: 32400 +2025-05-12T11:30:29 | INFO | __main__ : Current Frame Index within Batch Video: 112/247 +2025-05-12T11:30:29 | INFO | __main__ : Batch-wise Cosine Similarity | 86.58% +2025-05-12T11:30:29 | INFO | __main__ : Cosine Embedding Loss | 0.1342 +2025-05-12T11:30:29 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:30:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:30:29 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:30:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:30:52 | INFO | __main__ : Step: 32500 +2025-05-12T11:30:52 | INFO | __main__ : Current Frame Index within Batch Video: 212/247 +2025-05-12T11:30:52 | INFO | __main__ : Batch-wise Cosine Similarity | 91.50% +2025-05-12T11:30:52 | INFO | __main__ : Cosine Embedding Loss | 0.0850 +2025-05-12T11:30:52 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:30:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:30:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:31:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 134/4978] eta: 3 days, 8:37:31 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0851 eval_avg_sim: 0.6058 video-cosine_similarity: 0.9149 time: 59.7622 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:31:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:31:16 | INFO | __main__ : Step: 32600 +2025-05-12T11:31:16 | INFO | __main__ : Current Frame Index within Batch Video: 71/247 +2025-05-12T11:31:16 | INFO | __main__ : Batch-wise Cosine Similarity | 81.16% +2025-05-12T11:31:16 | INFO | __main__ : Cosine Embedding Loss | 0.1884 +2025-05-12T11:31:16 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:31:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:31:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:31:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:31:40 | INFO | __main__ : Step: 32700 +2025-05-12T11:31:40 | INFO | __main__ : Current Frame Index within Batch Video: 171/247 +2025-05-12T11:31:40 | INFO | 
__main__ : Batch-wise Cosine Similarity | 89.88% +2025-05-12T11:31:40 | INFO | __main__ : Cosine Embedding Loss | 0.1012 +2025-05-12T11:31:40 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:31:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:31:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:31:58 | INFO | utils.basic_utils : Train Epoch: [0] [ 135/4978] eta: 3 days, 8:35:03 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1146 eval_avg_sim: 0.6058 video-cosine_similarity: 0.8854 time: 59.7634 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:32:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:32:04 | INFO | __main__ : Step: 32800 +2025-05-12T11:32:04 | INFO | __main__ : Current Frame Index within Batch Video: 30/247 +2025-05-12T11:32:04 | INFO | __main__ : Batch-wise Cosine Similarity | 73.06% +2025-05-12T11:32:04 | INFO | __main__ : Cosine Embedding Loss | 0.2694 +2025-05-12T11:32:04 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:32:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:32:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:32:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:32:28 | INFO | __main__ : Step: 32900 +2025-05-12T11:32:28 | INFO | __main__ : Current Frame Index within Batch Video: 130/247 +2025-05-12T11:32:28 | INFO | __main__ : Batch-wise Cosine Similarity | 85.85% +2025-05-12T11:32:28 | INFO | __main__ : Cosine Embedding Loss | 0.1415 +2025-05-12T11:32:28 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:32:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:32:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:32:52 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:32:52 | INFO | __main__ : Step: 33000 +2025-05-12T11:32:52 | INFO | __main__ : Current Frame Index within Batch Video: 230/247 +2025-05-12T11:32:52 | INFO | __main__ : Batch-wise Cosine Similarity | 90.33% +2025-05-12T11:32:52 | INFO | __main__ : Cosine Embedding Loss | 0.0967 +2025-05-12T11:32:52 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:32:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:32:52 | INFO | __main__ : Evaluation Average Sim | 0.6058 +2025-05-12T11:32:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:32:52 | INFO | __main__ : Performing periodic evaluation at global step 33000... +2025-05-12T11:32:52 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:32:52 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:32:52 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:32:52 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:33:01 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.7447 +2025-05-12T11:33:01 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0033000.png +2025-05-12T11:33:01 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:33:01 | INFO | __main__ : Evaluation at step 33000 complete. 
Average Similarity: 0.7447 +2025-05-12T11:33:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 136/4978] eta: 3 days, 8:38:04 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1089 eval_avg_sim: 0.7447 video-cosine_similarity: 0.8911 time: 59.7597 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:33:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:33:25 | INFO | __main__ : Step: 33100 +2025-05-12T11:33:25 | INFO | __main__ : Current Frame Index within Batch Video: 89/247 +2025-05-12T11:33:25 | INFO | __main__ : Batch-wise Cosine Similarity | 79.82% +2025-05-12T11:33:25 | INFO | __main__ : Cosine Embedding Loss | 0.2018 +2025-05-12T11:33:25 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:33:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:33:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:33:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:33:49 | INFO | __main__ : Step: 33200 +2025-05-12T11:33:49 | INFO | __main__ : Current Frame Index within Batch Video: 189/247 +2025-05-12T11:33:49 | INFO | __main__ : Batch-wise Cosine Similarity | 86.29% +2025-05-12T11:33:49 | INFO | __main__ : Cosine Embedding Loss | 0.1371 +2025-05-12T11:33:49 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:33:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:33:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:34:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 137/4978] eta: 3 days, 8:35:36 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.1209 eval_avg_sim: 0.7447 video-cosine_similarity: 0.8791 time: 59.7585 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:34:13 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:34:13 | INFO | __main__ : Step: 33300 +2025-05-12T11:34:13 | INFO | __main__ : Current Frame Index within Batch Video: 48/247 +2025-05-12T11:34:13 | INFO | __main__ : Batch-wise Cosine Similarity | 77.35% +2025-05-12T11:34:13 | INFO | __main__ : Cosine Embedding Loss | 0.2265 +2025-05-12T11:34:13 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:34:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:34:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:34:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:34:36 | INFO | __main__ : Step: 33400 +2025-05-12T11:34:36 | INFO | __main__ : Current Frame Index within Batch Video: 148/247 +2025-05-12T11:34:36 | INFO | __main__ : Batch-wise Cosine Similarity | 88.39% +2025-05-12T11:34:36 | INFO | __main__ : Cosine Embedding Loss | 0.1161 +2025-05-12T11:34:36 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:34:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:34:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:00 | INFO | utils.basic_utils : Train Epoch: [0] [ 138/4978] eta: 3 days, 8:33:09 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0643 eval_avg_sim: 0.7447 video-cosine_similarity: 0.9357 time: 59.7591 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:35:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:00 | INFO | __main__ : Step: 33500 +2025-05-12T11:35:00 | INFO | __main__ : Current Frame Index within Batch Video: 7/247 +2025-05-12T11:35:00 | INFO | __main__ : Batch-wise Cosine Similarity | 53.30% +2025-05-12T11:35:00 | INFO | __main__ : Cosine Embedding Loss | 0.4670 +2025-05-12T11:35:00 | INFO | 
__main__ : Learning Rate | 0.000005 +2025-05-12T11:35:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:35:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:24 | INFO | __main__ : Step: 33600 +2025-05-12T11:35:24 | INFO | __main__ : Current Frame Index within Batch Video: 107/247 +2025-05-12T11:35:24 | INFO | __main__ : Batch-wise Cosine Similarity | 88.86% +2025-05-12T11:35:24 | INFO | __main__ : Cosine Embedding Loss | 0.1114 +2025-05-12T11:35:24 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:35:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:35:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:48 | INFO | __main__ : Step: 33700 +2025-05-12T11:35:48 | INFO | __main__ : Current Frame Index within Batch Video: 207/247 +2025-05-12T11:35:48 | INFO | __main__ : Batch-wise Cosine Similarity | 90.49% +2025-05-12T11:35:48 | INFO | __main__ : Cosine Embedding Loss | 0.0951 +2025-05-12T11:35:48 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:35:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:35:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:35:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 139/4978] eta: 3 days, 8:30:43 lr: 0.000005 temperature: 0.0126 video-loss_cosine: 0.0789 eval_avg_sim: 0.7447 video-cosine_similarity: 0.9211 time: 59.7551 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:36:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:36:12 | INFO | __main__ : Step: 33800 
+2025-05-12T11:36:12 | INFO | __main__ : Current Frame Index within Batch Video: 66/247 +2025-05-12T11:36:12 | INFO | __main__ : Batch-wise Cosine Similarity | 82.61% +2025-05-12T11:36:12 | INFO | __main__ : Cosine Embedding Loss | 0.1739 +2025-05-12T11:36:12 | INFO | __main__ : Learning Rate | 0.000005 +2025-05-12T11:36:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:36:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:36:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:36:35 | INFO | __main__ : Step: 33900 +2025-05-12T11:36:35 | INFO | __main__ : Current Frame Index within Batch Video: 166/247 +2025-05-12T11:36:35 | INFO | __main__ : Batch-wise Cosine Similarity | 90.78% +2025-05-12T11:36:35 | INFO | __main__ : Cosine Embedding Loss | 0.0922 +2025-05-12T11:36:35 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:36:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:36:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:36:55 | INFO | utils.basic_utils : Train Epoch: [0] [ 140/4978] eta: 3 days, 8:28:19 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0730 eval_avg_sim: 0.7447 video-cosine_similarity: 0.9270 time: 59.2952 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:36:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:36:59 | INFO | __main__ : Step: 34000 +2025-05-12T11:36:59 | INFO | __main__ : Current Frame Index within Batch Video: 25/247 +2025-05-12T11:36:59 | INFO | __main__ : Batch-wise Cosine Similarity | 69.64% +2025-05-12T11:36:59 | INFO | __main__ : Cosine Embedding Loss | 0.3036 +2025-05-12T11:36:59 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:36:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:36:59 | INFO | 
__main__ : Evaluation Average Sim | 0.7447 +2025-05-12T11:36:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:37:00 | INFO | __main__ : Performing periodic evaluation at global step 34000... +2025-05-12T11:37:00 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:37:00 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:37:00 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:37:00 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:37:10 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5888 +2025-05-12T11:37:10 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0034000.png +2025-05-12T11:37:10 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:37:10 | INFO | __main__ : Evaluation at step 34000 complete. 
Average Similarity: 0.5888 +2025-05-12T11:37:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:37:33 | INFO | __main__ : Step: 34100 +2025-05-12T11:37:33 | INFO | __main__ : Current Frame Index within Batch Video: 125/247 +2025-05-12T11:37:33 | INFO | __main__ : Batch-wise Cosine Similarity | 87.41% +2025-05-12T11:37:33 | INFO | __main__ : Cosine Embedding Loss | 0.1259 +2025-05-12T11:37:33 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:37:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:37:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:37:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:37:57 | INFO | __main__ : Step: 34200 +2025-05-12T11:37:57 | INFO | __main__ : Current Frame Index within Batch Video: 225/247 +2025-05-12T11:37:57 | INFO | __main__ : Batch-wise Cosine Similarity | 89.25% +2025-05-12T11:37:57 | INFO | __main__ : Cosine Embedding Loss | 0.1075 +2025-05-12T11:37:57 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:37:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:37:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:38:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 141/4978] eta: 3 days, 8:31:33 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0873 eval_avg_sim: 0.5888 video-cosine_similarity: 0.9127 time: 59.7921 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:38:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:38:21 | INFO | __main__ : Step: 34300 +2025-05-12T11:38:21 | INFO | __main__ : Current Frame Index within Batch Video: 84/247 +2025-05-12T11:38:21 | INFO | __main__ : Batch-wise Cosine Similarity | 83.04% +2025-05-12T11:38:21 | INFO | 
__main__ : Cosine Embedding Loss | 0.1696 +2025-05-12T11:38:21 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:38:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:38:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:38:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:38:44 | INFO | __main__ : Step: 34400 +2025-05-12T11:38:44 | INFO | __main__ : Current Frame Index within Batch Video: 184/247 +2025-05-12T11:38:44 | INFO | __main__ : Batch-wise Cosine Similarity | 88.44% +2025-05-12T11:38:44 | INFO | __main__ : Cosine Embedding Loss | 0.1156 +2025-05-12T11:38:44 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:38:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:38:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:38:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 142/4978] eta: 3 days, 8:29:06 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0957 eval_avg_sim: 0.5888 video-cosine_similarity: 0.9043 time: 59.7907 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:39:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:39:08 | INFO | __main__ : Step: 34500 +2025-05-12T11:39:08 | INFO | __main__ : Current Frame Index within Batch Video: 43/247 +2025-05-12T11:39:08 | INFO | __main__ : Batch-wise Cosine Similarity | 74.96% +2025-05-12T11:39:08 | INFO | __main__ : Cosine Embedding Loss | 0.2504 +2025-05-12T11:39:08 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:39:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:39:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:39:32 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:39:32 | INFO | __main__ : Step: 34600 +2025-05-12T11:39:32 | INFO | __main__ : Current Frame Index within Batch Video: 143/247 +2025-05-12T11:39:32 | INFO | __main__ : Batch-wise Cosine Similarity | 89.25% +2025-05-12T11:39:32 | INFO | __main__ : Cosine Embedding Loss | 0.1075 +2025-05-12T11:39:32 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:39:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:39:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:39:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:39:56 | INFO | __main__ : Step: 34700 +2025-05-12T11:39:56 | INFO | __main__ : Current Frame Index within Batch Video: 243/247 +2025-05-12T11:39:56 | INFO | __main__ : Batch-wise Cosine Similarity | 93.94% +2025-05-12T11:39:56 | INFO | __main__ : Cosine Embedding Loss | 0.0606 +2025-05-12T11:39:56 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:39:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:39:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:39:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 143/4978] eta: 3 days, 8:26:44 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0608 eval_avg_sim: 0.5888 video-cosine_similarity: 0.9392 time: 59.7912 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:40:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:40:20 | INFO | __main__ : Step: 34800 +2025-05-12T11:40:20 | INFO | __main__ : Current Frame Index within Batch Video: 102/247 +2025-05-12T11:40:20 | INFO | __main__ : Batch-wise Cosine Similarity | 85.74% +2025-05-12T11:40:20 | INFO | __main__ : Cosine Embedding Loss | 0.1426 +2025-05-12T11:40:20 | 
INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:40:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:40:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:40:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:40:44 | INFO | __main__ : Step: 34900 +2025-05-12T11:40:44 | INFO | __main__ : Current Frame Index within Batch Video: 202/247 +2025-05-12T11:40:44 | INFO | __main__ : Batch-wise Cosine Similarity | 89.84% +2025-05-12T11:40:44 | INFO | __main__ : Cosine Embedding Loss | 0.1016 +2025-05-12T11:40:44 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:40:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:40:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:40:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 144/4978] eta: 3 days, 8:24:22 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0823 eval_avg_sim: 0.5888 video-cosine_similarity: 0.9177 time: 59.3199 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:41:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:41:08 | INFO | __main__ : Step: 35000 +2025-05-12T11:41:08 | INFO | __main__ : Current Frame Index within Batch Video: 61/247 +2025-05-12T11:41:08 | INFO | __main__ : Batch-wise Cosine Similarity | 78.74% +2025-05-12T11:41:08 | INFO | __main__ : Cosine Embedding Loss | 0.2126 +2025-05-12T11:41:08 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:41:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:41:08 | INFO | __main__ : Evaluation Average Sim | 0.5888 +2025-05-12T11:41:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:41:08 | INFO | __main__ : Saving checkpoint at global step 35000 +2025-05-12T11:41:08 
| INFO | __main__ : Performing periodic evaluation at global step 35000... +2025-05-12T11:41:08 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:41:08 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:41:08 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:41:08 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:41:17 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6726 +2025-05-12T11:41:17 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0035000.png +2025-05-12T11:41:17 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:41:17 | INFO | __main__ : Evaluation at step 35000 complete. Average Similarity: 0.6726 +2025-05-12T11:41:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:41:41 | INFO | __main__ : Step: 35100 +2025-05-12T11:41:41 | INFO | __main__ : Current Frame Index within Batch Video: 161/247 +2025-05-12T11:41:41 | INFO | __main__ : Batch-wise Cosine Similarity | 89.91% +2025-05-12T11:41:41 | INFO | __main__ : Cosine Embedding Loss | 0.1009 +2025-05-12T11:41:41 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:41:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:41:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 145/4978] eta: 3 days, 8:27:16 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0538 eval_avg_sim: 0.6726 video-cosine_similarity: 0.9462 time: 59.7950 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:42:05 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:05 | INFO | __main__ : Step: 35200 +2025-05-12T11:42:05 | INFO | __main__ : Current Frame Index within Batch Video: 20/247 +2025-05-12T11:42:05 | INFO | __main__ : Batch-wise Cosine Similarity | 65.69% +2025-05-12T11:42:05 | INFO | __main__ : Cosine Embedding Loss | 0.3431 +2025-05-12T11:42:05 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:42:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:42:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:29 | INFO | __main__ : Step: 35300 +2025-05-12T11:42:29 | INFO | __main__ : Current Frame Index within Batch Video: 120/247 +2025-05-12T11:42:29 | INFO | __main__ : Batch-wise Cosine Similarity | 89.92% +2025-05-12T11:42:29 | INFO | __main__ : Cosine Embedding Loss | 0.1008 +2025-05-12T11:42:29 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:42:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:42:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:52 | INFO | __main__ : Step: 35400 +2025-05-12T11:42:52 | INFO | __main__ : Current Frame Index within Batch Video: 220/247 +2025-05-12T11:42:52 | INFO | __main__ : Batch-wise Cosine Similarity | 89.72% +2025-05-12T11:42:52 | INFO | __main__ : Cosine Embedding Loss | 0.1028 +2025-05-12T11:42:52 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:42:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:42:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:42:59 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 146/4978] eta: 3 days, 8:24:53 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0639 eval_avg_sim: 0.6726 video-cosine_similarity: 0.9361 time: 59.7933 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:43:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:43:16 | INFO | __main__ : Step: 35500 +2025-05-12T11:43:16 | INFO | __main__ : Current Frame Index within Batch Video: 79/247 +2025-05-12T11:43:16 | INFO | __main__ : Batch-wise Cosine Similarity | 83.55% +2025-05-12T11:43:16 | INFO | __main__ : Cosine Embedding Loss | 0.1645 +2025-05-12T11:43:16 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:43:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:43:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:43:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:43:40 | INFO | __main__ : Step: 35600 +2025-05-12T11:43:40 | INFO | __main__ : Current Frame Index within Batch Video: 179/247 +2025-05-12T11:43:40 | INFO | __main__ : Batch-wise Cosine Similarity | 88.17% +2025-05-12T11:43:40 | INFO | __main__ : Cosine Embedding Loss | 0.1183 +2025-05-12T11:43:40 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:43:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:43:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:43:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 147/4978] eta: 3 days, 8:22:30 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0605 eval_avg_sim: 0.6726 video-cosine_similarity: 0.9395 time: 59.7919 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:44:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:44:04 | INFO | __main__ : 
Step: 35700 +2025-05-12T11:44:04 | INFO | __main__ : Current Frame Index within Batch Video: 38/247 +2025-05-12T11:44:04 | INFO | __main__ : Batch-wise Cosine Similarity | 78.40% +2025-05-12T11:44:04 | INFO | __main__ : Cosine Embedding Loss | 0.2160 +2025-05-12T11:44:04 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:44:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:44:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:44:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:44:28 | INFO | __main__ : Step: 35800 +2025-05-12T11:44:28 | INFO | __main__ : Current Frame Index within Batch Video: 138/247 +2025-05-12T11:44:28 | INFO | __main__ : Batch-wise Cosine Similarity | 89.06% +2025-05-12T11:44:28 | INFO | __main__ : Cosine Embedding Loss | 0.1094 +2025-05-12T11:44:28 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:44:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:44:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:44:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:44:51 | INFO | __main__ : Step: 35900 +2025-05-12T11:44:51 | INFO | __main__ : Current Frame Index within Batch Video: 238/247 +2025-05-12T11:44:51 | INFO | __main__ : Batch-wise Cosine Similarity | 92.27% +2025-05-12T11:44:51 | INFO | __main__ : Cosine Embedding Loss | 0.0773 +2025-05-12T11:44:51 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:44:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:44:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:44:53 | INFO | utils.basic_utils : Train Epoch: [0] [ 148/4978] eta: 3 days, 8:20:10 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0626 
eval_avg_sim: 0.6726 video-cosine_similarity: 0.9374 time: 59.3249 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:45:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:45:15 | INFO | __main__ : Step: 36000 +2025-05-12T11:45:15 | INFO | __main__ : Current Frame Index within Batch Video: 97/247 +2025-05-12T11:45:15 | INFO | __main__ : Batch-wise Cosine Similarity | 84.74% +2025-05-12T11:45:15 | INFO | __main__ : Cosine Embedding Loss | 0.1526 +2025-05-12T11:45:15 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:45:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:45:15 | INFO | __main__ : Evaluation Average Sim | 0.6726 +2025-05-12T11:45:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:45:16 | INFO | __main__ : Performing periodic evaluation at global step 36000... +2025-05-12T11:45:16 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:45:16 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:45:16 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:45:16 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:45:25 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6243 +2025-05-12T11:45:25 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0036000.png +2025-05-12T11:45:25 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:45:25 | INFO | __main__ : Evaluation at step 36000 complete. 
Average Similarity: 0.6243 +2025-05-12T11:45:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:45:48 | INFO | __main__ : Step: 36100 +2025-05-12T11:45:48 | INFO | __main__ : Current Frame Index within Batch Video: 197/247 +2025-05-12T11:45:48 | INFO | __main__ : Batch-wise Cosine Similarity | 89.49% +2025-05-12T11:45:48 | INFO | __main__ : Cosine Embedding Loss | 0.1051 +2025-05-12T11:45:48 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:45:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:45:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:46:00 | INFO | utils.basic_utils : Train Epoch: [0] [ 149/4978] eta: 3 days, 8:22:53 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0945 eval_avg_sim: 0.6243 video-cosine_similarity: 0.9055 time: 59.7913 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:46:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:46:12 | INFO | __main__ : Step: 36200 +2025-05-12T11:46:12 | INFO | __main__ : Current Frame Index within Batch Video: 56/247 +2025-05-12T11:46:12 | INFO | __main__ : Batch-wise Cosine Similarity | 79.73% +2025-05-12T11:46:12 | INFO | __main__ : Cosine Embedding Loss | 0.2027 +2025-05-12T11:46:12 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:46:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:46:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:46:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:46:36 | INFO | __main__ : Step: 36300 +2025-05-12T11:46:36 | INFO | __main__ : Current Frame Index within Batch Video: 156/247 +2025-05-12T11:46:36 | INFO | __main__ : Batch-wise Cosine Similarity | 90.51% +2025-05-12T11:46:36 | INFO | 
__main__ : Cosine Embedding Loss | 0.0949 +2025-05-12T11:46:36 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:46:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:46:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:46:58 | INFO | utils.basic_utils : Train Epoch: [0] [ 150/4978] eta: 3 days, 8:20:32 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0628 eval_avg_sim: 0.6243 video-cosine_similarity: 0.9372 time: 59.7936 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:47:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:47:00 | INFO | __main__ : Step: 36400 +2025-05-12T11:47:00 | INFO | __main__ : Current Frame Index within Batch Video: 15/247 +2025-05-12T11:47:00 | INFO | __main__ : Batch-wise Cosine Similarity | 60.03% +2025-05-12T11:47:00 | INFO | __main__ : Cosine Embedding Loss | 0.3997 +2025-05-12T11:47:00 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:47:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:47:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:47:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:47:24 | INFO | __main__ : Step: 36500 +2025-05-12T11:47:24 | INFO | __main__ : Current Frame Index within Batch Video: 115/247 +2025-05-12T11:47:24 | INFO | __main__ : Batch-wise Cosine Similarity | 87.92% +2025-05-12T11:47:24 | INFO | __main__ : Cosine Embedding Loss | 0.1208 +2025-05-12T11:47:24 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:47:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:47:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:47:47 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:47:47 | INFO | __main__ : Step: 36600 +2025-05-12T11:47:47 | INFO | __main__ : Current Frame Index within Batch Video: 215/247 +2025-05-12T11:47:47 | INFO | __main__ : Batch-wise Cosine Similarity | 91.30% +2025-05-12T11:47:47 | INFO | __main__ : Cosine Embedding Loss | 0.0870 +2025-05-12T11:47:47 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:47:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:47:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:47:55 | INFO | utils.basic_utils : Train Epoch: [0] [ 151/4978] eta: 3 days, 8:18:13 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.1000 eval_avg_sim: 0.6243 video-cosine_similarity: 0.9000 time: 59.7911 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:48:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:48:11 | INFO | __main__ : Step: 36700 +2025-05-12T11:48:11 | INFO | __main__ : Current Frame Index within Batch Video: 74/247 +2025-05-12T11:48:11 | INFO | __main__ : Batch-wise Cosine Similarity | 80.97% +2025-05-12T11:48:11 | INFO | __main__ : Cosine Embedding Loss | 0.1903 +2025-05-12T11:48:11 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:48:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:48:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:48:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:48:35 | INFO | __main__ : Step: 36800 +2025-05-12T11:48:35 | INFO | __main__ : Current Frame Index within Batch Video: 174/247 +2025-05-12T11:48:35 | INFO | __main__ : Batch-wise Cosine Similarity | 90.71% +2025-05-12T11:48:35 | INFO | __main__ : Cosine Embedding Loss | 0.0929 +2025-05-12T11:48:35 | INFO 
| __main__ : Learning Rate | 0.000006 +2025-05-12T11:48:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:48:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:48:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 152/4978] eta: 3 days, 8:15:54 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0880 eval_avg_sim: 0.6243 video-cosine_similarity: 0.9120 time: 59.3228 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:48:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:48:59 | INFO | __main__ : Step: 36900 +2025-05-12T11:48:59 | INFO | __main__ : Current Frame Index within Batch Video: 33/247 +2025-05-12T11:48:59 | INFO | __main__ : Batch-wise Cosine Similarity | 76.23% +2025-05-12T11:48:59 | INFO | __main__ : Cosine Embedding Loss | 0.2377 +2025-05-12T11:48:59 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:48:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:48:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:49:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:49:23 | INFO | __main__ : Step: 37000 +2025-05-12T11:49:23 | INFO | __main__ : Current Frame Index within Batch Video: 133/247 +2025-05-12T11:49:23 | INFO | __main__ : Batch-wise Cosine Similarity | 84.84% +2025-05-12T11:49:23 | INFO | __main__ : Cosine Embedding Loss | 0.1516 +2025-05-12T11:49:23 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:49:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:49:23 | INFO | __main__ : Evaluation Average Sim | 0.6243 +2025-05-12T11:49:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:49:23 | INFO | __main__ : Performing periodic evaluation at global step 37000... 
+2025-05-12T11:49:23 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:49:23 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:49:23 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:49:23 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:49:32 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5974 +2025-05-12T11:49:32 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0037000.png +2025-05-12T11:49:32 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:49:32 | INFO | __main__ : Evaluation at step 37000 complete. Average Similarity: 0.5974 +2025-05-12T11:49:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:49:56 | INFO | __main__ : Step: 37100 +2025-05-12T11:49:56 | INFO | __main__ : Current Frame Index within Batch Video: 233/247 +2025-05-12T11:49:56 | INFO | __main__ : Batch-wise Cosine Similarity | 88.39% +2025-05-12T11:49:56 | INFO | __main__ : Cosine Embedding Loss | 0.1161 +2025-05-12T11:49:56 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:49:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:49:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:49:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 153/4978] eta: 3 days, 8:18:31 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0980 eval_avg_sim: 0.5974 video-cosine_similarity: 0.9020 time: 59.7913 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:50:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:50:20 | INFO | __main__ : 
Step: 37200 +2025-05-12T11:50:20 | INFO | __main__ : Current Frame Index within Batch Video: 92/247 +2025-05-12T11:50:20 | INFO | __main__ : Batch-wise Cosine Similarity | 86.19% +2025-05-12T11:50:20 | INFO | __main__ : Cosine Embedding Loss | 0.1381 +2025-05-12T11:50:20 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:50:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:50:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:50:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:50:44 | INFO | __main__ : Step: 37300 +2025-05-12T11:50:44 | INFO | __main__ : Current Frame Index within Batch Video: 192/247 +2025-05-12T11:50:44 | INFO | __main__ : Batch-wise Cosine Similarity | 92.24% +2025-05-12T11:50:44 | INFO | __main__ : Cosine Embedding Loss | 0.0776 +2025-05-12T11:50:44 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:50:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:50:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:50:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 154/4978] eta: 3 days, 8:16:13 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0701 eval_avg_sim: 0.5974 video-cosine_similarity: 0.9299 time: 59.7913 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:51:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:51:08 | INFO | __main__ : Step: 37400 +2025-05-12T11:51:08 | INFO | __main__ : Current Frame Index within Batch Video: 51/247 +2025-05-12T11:51:08 | INFO | __main__ : Batch-wise Cosine Similarity | 80.87% +2025-05-12T11:51:08 | INFO | __main__ : Cosine Embedding Loss | 0.1913 +2025-05-12T11:51:08 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:51:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:51:08 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:51:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:51:31 | INFO | __main__ : Step: 37500 +2025-05-12T11:51:31 | INFO | __main__ : Current Frame Index within Batch Video: 151/247 +2025-05-12T11:51:31 | INFO | __main__ : Batch-wise Cosine Similarity | 91.06% +2025-05-12T11:51:31 | INFO | __main__ : Cosine Embedding Loss | 0.0894 +2025-05-12T11:51:31 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:51:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:51:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:51:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 155/4978] eta: 3 days, 8:13:56 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0736 eval_avg_sim: 0.5974 video-cosine_similarity: 0.9264 time: 59.7911 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:51:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:51:55 | INFO | __main__ : Step: 37600 +2025-05-12T11:51:55 | INFO | __main__ : Current Frame Index within Batch Video: 10/247 +2025-05-12T11:51:55 | INFO | __main__ : Batch-wise Cosine Similarity | 57.62% +2025-05-12T11:51:55 | INFO | __main__ : Cosine Embedding Loss | 0.4238 +2025-05-12T11:51:55 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:51:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:51:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:52:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:52:19 | INFO | __main__ : Step: 37700 +2025-05-12T11:52:19 | INFO | __main__ : Current Frame Index within Batch Video: 110/247 +2025-05-12T11:52:19 | INFO | __main__ 
: Batch-wise Cosine Similarity | 87.76% +2025-05-12T11:52:19 | INFO | __main__ : Cosine Embedding Loss | 0.1224 +2025-05-12T11:52:19 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:52:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:52:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:52:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:52:43 | INFO | __main__ : Step: 37800 +2025-05-12T11:52:43 | INFO | __main__ : Current Frame Index within Batch Video: 210/247 +2025-05-12T11:52:43 | INFO | __main__ : Batch-wise Cosine Similarity | 90.41% +2025-05-12T11:52:43 | INFO | __main__ : Cosine Embedding Loss | 0.0959 +2025-05-12T11:52:43 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:52:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:52:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:52:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 156/4978] eta: 3 days, 8:11:41 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0602 eval_avg_sim: 0.5974 video-cosine_similarity: 0.9398 time: 59.3267 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T11:53:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:53:07 | INFO | __main__ : Step: 37900 +2025-05-12T11:53:07 | INFO | __main__ : Current Frame Index within Batch Video: 69/247 +2025-05-12T11:53:07 | INFO | __main__ : Batch-wise Cosine Similarity | 83.99% +2025-05-12T11:53:07 | INFO | __main__ : Cosine Embedding Loss | 0.1601 +2025-05-12T11:53:07 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:53:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:53:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:53:31 | INFO | __main__ 
: ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:53:31 | INFO | __main__ : Step: 38000 +2025-05-12T11:53:31 | INFO | __main__ : Current Frame Index within Batch Video: 169/247 +2025-05-12T11:53:31 | INFO | __main__ : Batch-wise Cosine Similarity | 91.16% +2025-05-12T11:53:31 | INFO | __main__ : Cosine Embedding Loss | 0.0884 +2025-05-12T11:53:31 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:53:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:53:31 | INFO | __main__ : Evaluation Average Sim | 0.5974 +2025-05-12T11:53:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:53:31 | INFO | __main__ : Performing periodic evaluation at global step 38000... +2025-05-12T11:53:31 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:53:31 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:53:31 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:53:31 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:53:40 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6854 +2025-05-12T11:53:40 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0038000.png +2025-05-12T11:53:40 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:53:40 | INFO | __main__ : Evaluation at step 38000 complete. 
Average Similarity: 0.6854 +2025-05-12T11:53:58 | INFO | utils.basic_utils : Train Epoch: [0] [ 157/4978] eta: 3 days, 8:14:13 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.1096 eval_avg_sim: 0.6854 video-cosine_similarity: 0.8904 time: 59.7949 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:54:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:54:04 | INFO | __main__ : Step: 38100 +2025-05-12T11:54:04 | INFO | __main__ : Current Frame Index within Batch Video: 28/247 +2025-05-12T11:54:04 | INFO | __main__ : Batch-wise Cosine Similarity | 73.86% +2025-05-12T11:54:04 | INFO | __main__ : Cosine Embedding Loss | 0.2614 +2025-05-12T11:54:04 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:54:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:54:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:54:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:54:28 | INFO | __main__ : Step: 38200 +2025-05-12T11:54:28 | INFO | __main__ : Current Frame Index within Batch Video: 128/247 +2025-05-12T11:54:28 | INFO | __main__ : Batch-wise Cosine Similarity | 88.43% +2025-05-12T11:54:28 | INFO | __main__ : Cosine Embedding Loss | 0.1157 +2025-05-12T11:54:28 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:54:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:54:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:54:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:54:51 | INFO | __main__ : Step: 38300 +2025-05-12T11:54:51 | INFO | __main__ : Current Frame Index within Batch Video: 228/247 +2025-05-12T11:54:51 | INFO | __main__ : Batch-wise Cosine Similarity | 91.49% +2025-05-12T11:54:51 | INFO | 
__main__ : Cosine Embedding Loss | 0.0851 +2025-05-12T11:54:51 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:54:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:54:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:54:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 158/4978] eta: 3 days, 8:11:58 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0891 eval_avg_sim: 0.6854 video-cosine_similarity: 0.9109 time: 59.7948 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:55:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:55:15 | INFO | __main__ : Step: 38400 +2025-05-12T11:55:15 | INFO | __main__ : Current Frame Index within Batch Video: 87/247 +2025-05-12T11:55:15 | INFO | __main__ : Batch-wise Cosine Similarity | 83.56% +2025-05-12T11:55:15 | INFO | __main__ : Cosine Embedding Loss | 0.1644 +2025-05-12T11:55:15 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:55:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:55:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:55:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:55:39 | INFO | __main__ : Step: 38500 +2025-05-12T11:55:39 | INFO | __main__ : Current Frame Index within Batch Video: 187/247 +2025-05-12T11:55:39 | INFO | __main__ : Batch-wise Cosine Similarity | 90.75% +2025-05-12T11:55:39 | INFO | __main__ : Cosine Embedding Loss | 0.0925 +2025-05-12T11:55:39 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:55:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:55:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:55:53 | INFO | utils.basic_utils : Train Epoch: [0] [ 159/4978] eta: 3 days, 8:09:44 lr: 
0.000006 temperature: 0.0126 video-loss_cosine: 0.0745 eval_avg_sim: 0.6854 video-cosine_similarity: 0.9255 time: 59.7957 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:56:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:56:03 | INFO | __main__ : Step: 38600 +2025-05-12T11:56:03 | INFO | __main__ : Current Frame Index within Batch Video: 46/247 +2025-05-12T11:56:03 | INFO | __main__ : Batch-wise Cosine Similarity | 79.37% +2025-05-12T11:56:03 | INFO | __main__ : Cosine Embedding Loss | 0.2063 +2025-05-12T11:56:03 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:56:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:56:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:56:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:56:27 | INFO | __main__ : Step: 38700 +2025-05-12T11:56:27 | INFO | __main__ : Current Frame Index within Batch Video: 146/247 +2025-05-12T11:56:27 | INFO | __main__ : Batch-wise Cosine Similarity | 90.99% +2025-05-12T11:56:27 | INFO | __main__ : Cosine Embedding Loss | 0.0901 +2025-05-12T11:56:27 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:56:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:56:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:56:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:56:50 | INFO | __main__ : Step: 38800 +2025-05-12T11:56:50 | INFO | __main__ : Current Frame Index within Batch Video: 246/247 +2025-05-12T11:56:50 | INFO | __main__ : Batch-wise Cosine Similarity | 93.71% +2025-05-12T11:56:50 | INFO | __main__ : Cosine Embedding Loss | 0.0629 +2025-05-12T11:56:50 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:56:50 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:56:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:56:51 | INFO | utils.basic_utils : Train Epoch: [0] [ 160/4978] eta: 3 days, 8:07:31 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0619 eval_avg_sim: 0.6854 video-cosine_similarity: 0.9381 time: 59.7954 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:57:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:57:14 | INFO | __main__ : Step: 38900 +2025-05-12T11:57:14 | INFO | __main__ : Current Frame Index within Batch Video: 105/247 +2025-05-12T11:57:14 | INFO | __main__ : Batch-wise Cosine Similarity | 87.91% +2025-05-12T11:57:14 | INFO | __main__ : Cosine Embedding Loss | 0.1209 +2025-05-12T11:57:14 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:57:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:57:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:57:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:57:38 | INFO | __main__ : Step: 39000 +2025-05-12T11:57:38 | INFO | __main__ : Current Frame Index within Batch Video: 205/247 +2025-05-12T11:57:38 | INFO | __main__ : Batch-wise Cosine Similarity | 93.09% +2025-05-12T11:57:38 | INFO | __main__ : Cosine Embedding Loss | 0.0691 +2025-05-12T11:57:38 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:57:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:57:38 | INFO | __main__ : Evaluation Average Sim | 0.6854 +2025-05-12T11:57:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:57:38 | INFO | __main__ : Performing periodic evaluation at global step 39000... 
+2025-05-12T11:57:38 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T11:57:39 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T11:57:39 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T11:57:39 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T11:57:48 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5302 +2025-05-12T11:57:48 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0039000.png +2025-05-12T11:57:48 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T11:57:48 | INFO | __main__ : Evaluation at step 39000 complete. Average Similarity: 0.5302 +2025-05-12T11:57:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 161/4978] eta: 3 days, 8:09:55 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0602 eval_avg_sim: 0.5302 video-cosine_similarity: 0.9398 time: 59.7650 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:58:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:58:11 | INFO | __main__ : Step: 39100 +2025-05-12T11:58:11 | INFO | __main__ : Current Frame Index within Batch Video: 64/247 +2025-05-12T11:58:11 | INFO | __main__ : Batch-wise Cosine Similarity | 84.60% +2025-05-12T11:58:11 | INFO | __main__ : Cosine Embedding Loss | 0.1540 +2025-05-12T11:58:11 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:58:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:58:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:58:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:58:35 | INFO | __main__ : 
Step: 39200 +2025-05-12T11:58:35 | INFO | __main__ : Current Frame Index within Batch Video: 164/247 +2025-05-12T11:58:35 | INFO | __main__ : Batch-wise Cosine Similarity | 86.45% +2025-05-12T11:58:35 | INFO | __main__ : Cosine Embedding Loss | 0.1355 +2025-05-12T11:58:35 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:58:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:58:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:58:55 | INFO | utils.basic_utils : Train Epoch: [0] [ 162/4978] eta: 3 days, 8:07:43 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0747 eval_avg_sim: 0.5302 video-cosine_similarity: 0.9253 time: 59.7707 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T11:58:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:58:59 | INFO | __main__ : Step: 39300 +2025-05-12T11:58:59 | INFO | __main__ : Current Frame Index within Batch Video: 23/247 +2025-05-12T11:58:59 | INFO | __main__ : Batch-wise Cosine Similarity | 70.35% +2025-05-12T11:58:59 | INFO | __main__ : Cosine Embedding Loss | 0.2965 +2025-05-12T11:58:59 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:58:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:58:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:59:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:59:23 | INFO | __main__ : Step: 39400 +2025-05-12T11:59:23 | INFO | __main__ : Current Frame Index within Batch Video: 123/247 +2025-05-12T11:59:23 | INFO | __main__ : Batch-wise Cosine Similarity | 88.05% +2025-05-12T11:59:23 | INFO | __main__ : Cosine Embedding Loss | 0.1195 +2025-05-12T11:59:23 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:59:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:59:23 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:59:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:59:47 | INFO | __main__ : Step: 39500 +2025-05-12T11:59:47 | INFO | __main__ : Current Frame Index within Batch Video: 223/247 +2025-05-12T11:59:47 | INFO | __main__ : Batch-wise Cosine Similarity | 88.78% +2025-05-12T11:59:47 | INFO | __main__ : Cosine Embedding Loss | 0.1122 +2025-05-12T11:59:47 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T11:59:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T11:59:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T11:59:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 163/4978] eta: 3 days, 8:05:32 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0800 eval_avg_sim: 0.5302 video-cosine_similarity: 0.9200 time: 59.7723 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:00:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:00:11 | INFO | __main__ : Step: 39600 +2025-05-12T12:00:11 | INFO | __main__ : Current Frame Index within Batch Video: 82/247 +2025-05-12T12:00:11 | INFO | __main__ : Batch-wise Cosine Similarity | 85.89% +2025-05-12T12:00:11 | INFO | __main__ : Cosine Embedding Loss | 0.1411 +2025-05-12T12:00:11 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T12:00:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:00:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:00:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:00:34 | INFO | __main__ : Step: 39700 +2025-05-12T12:00:34 | INFO | __main__ : Current Frame Index within Batch Video: 182/247 +2025-05-12T12:00:34 | INFO | 
__main__ : Batch-wise Cosine Similarity | 90.10% +2025-05-12T12:00:34 | INFO | __main__ : Cosine Embedding Loss | 0.0990 +2025-05-12T12:00:34 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T12:00:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:00:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:00:50 | INFO | utils.basic_utils : Train Epoch: [0] [ 164/4978] eta: 3 days, 8:03:22 lr: 0.000006 temperature: 0.0126 video-loss_cosine: 0.0599 eval_avg_sim: 0.5302 video-cosine_similarity: 0.9401 time: 59.7742 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:00:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:00:58 | INFO | __main__ : Step: 39800 +2025-05-12T12:00:58 | INFO | __main__ : Current Frame Index within Batch Video: 41/247 +2025-05-12T12:00:58 | INFO | __main__ : Batch-wise Cosine Similarity | 78.23% +2025-05-12T12:00:58 | INFO | __main__ : Cosine Embedding Loss | 0.2177 +2025-05-12T12:00:58 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T12:00:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:00:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:01:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:01:22 | INFO | __main__ : Step: 39900 +2025-05-12T12:01:22 | INFO | __main__ : Current Frame Index within Batch Video: 141/247 +2025-05-12T12:01:22 | INFO | __main__ : Batch-wise Cosine Similarity | 89.74% +2025-05-12T12:01:22 | INFO | __main__ : Cosine Embedding Loss | 0.1026 +2025-05-12T12:01:22 | INFO | __main__ : Learning Rate | 0.000006 +2025-05-12T12:01:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:01:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:01:46 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:01:46 | INFO | __main__ : Step: 40000 +2025-05-12T12:01:46 | INFO | __main__ : Current Frame Index within Batch Video: 241/247 +2025-05-12T12:01:46 | INFO | __main__ : Batch-wise Cosine Similarity | 91.84% +2025-05-12T12:01:46 | INFO | __main__ : Cosine Embedding Loss | 0.0816 +2025-05-12T12:01:46 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:01:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:01:46 | INFO | __main__ : Evaluation Average Sim | 0.5302 +2025-05-12T12:01:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:01:46 | INFO | __main__ : Saving checkpoint at global step 40000 +2025-05-12T12:01:46 | INFO | __main__ : Performing periodic evaluation at global step 40000... +2025-05-12T12:01:46 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:01:46 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:01:46 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:01:46 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:01:56 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5751 +2025-05-12T12:01:56 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0040000.png +2025-05-12T12:01:56 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:01:56 | INFO | __main__ : Evaluation at step 40000 complete. 
Average Similarity: 0.5751 +2025-05-12T12:01:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 165/4978] eta: 3 days, 8:05:46 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0803 eval_avg_sim: 0.5751 video-cosine_similarity: 0.9197 time: 59.7727 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:02:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:02:19 | INFO | __main__ : Step: 40100 +2025-05-12T12:02:19 | INFO | __main__ : Current Frame Index within Batch Video: 100/247 +2025-05-12T12:02:19 | INFO | __main__ : Batch-wise Cosine Similarity | 82.56% +2025-05-12T12:02:19 | INFO | __main__ : Cosine Embedding Loss | 0.1744 +2025-05-12T12:02:19 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:02:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:02:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:02:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:02:43 | INFO | __main__ : Step: 40200 +2025-05-12T12:02:43 | INFO | __main__ : Current Frame Index within Batch Video: 200/247 +2025-05-12T12:02:43 | INFO | __main__ : Batch-wise Cosine Similarity | 89.70% +2025-05-12T12:02:43 | INFO | __main__ : Cosine Embedding Loss | 0.1030 +2025-05-12T12:02:43 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:02:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:02:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:02:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 166/4978] eta: 3 days, 8:03:35 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0932 eval_avg_sim: 0.5751 video-cosine_similarity: 0.9068 time: 59.7724 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:03:07 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:03:07 | INFO | __main__ : Step: 40300 +2025-05-12T12:03:07 | INFO | __main__ : Current Frame Index within Batch Video: 59/247 +2025-05-12T12:03:07 | INFO | __main__ : Batch-wise Cosine Similarity | 80.19% +2025-05-12T12:03:07 | INFO | __main__ : Cosine Embedding Loss | 0.1981 +2025-05-12T12:03:07 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:03:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:03:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:03:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:03:31 | INFO | __main__ : Step: 40400 +2025-05-12T12:03:31 | INFO | __main__ : Current Frame Index within Batch Video: 159/247 +2025-05-12T12:03:31 | INFO | __main__ : Batch-wise Cosine Similarity | 89.10% +2025-05-12T12:03:31 | INFO | __main__ : Cosine Embedding Loss | 0.1090 +2025-05-12T12:03:31 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:03:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:03:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:03:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 167/4978] eta: 3 days, 8:01:25 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0987 eval_avg_sim: 0.5751 video-cosine_similarity: 0.9013 time: 59.7764 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:03:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:03:55 | INFO | __main__ : Step: 40500 +2025-05-12T12:03:55 | INFO | __main__ : Current Frame Index within Batch Video: 18/247 +2025-05-12T12:03:55 | INFO | __main__ : Batch-wise Cosine Similarity | 66.06% +2025-05-12T12:03:55 | INFO | __main__ : Cosine Embedding Loss | 0.3394 +2025-05-12T12:03:55 | INFO 
| __main__ : Learning Rate | 0.000007 +2025-05-12T12:03:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:03:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:04:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:04:18 | INFO | __main__ : Step: 40600 +2025-05-12T12:04:18 | INFO | __main__ : Current Frame Index within Batch Video: 118/247 +2025-05-12T12:04:18 | INFO | __main__ : Batch-wise Cosine Similarity | 87.38% +2025-05-12T12:04:18 | INFO | __main__ : Cosine Embedding Loss | 0.1262 +2025-05-12T12:04:18 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:04:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:04:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:04:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:04:42 | INFO | __main__ : Step: 40700 +2025-05-12T12:04:42 | INFO | __main__ : Current Frame Index within Batch Video: 218/247 +2025-05-12T12:04:42 | INFO | __main__ : Batch-wise Cosine Similarity | 91.92% +2025-05-12T12:04:42 | INFO | __main__ : Cosine Embedding Loss | 0.0808 +2025-05-12T12:04:42 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:04:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:04:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:04:49 | INFO | utils.basic_utils : Train Epoch: [0] [ 168/4978] eta: 3 days, 7:59:16 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0666 eval_avg_sim: 0.5751 video-cosine_similarity: 0.9334 time: 59.7780 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:05:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:05:06 | INFO | __main__ : Step: 40800 
+2025-05-12T12:05:06 | INFO | __main__ : Current Frame Index within Batch Video: 77/247 +2025-05-12T12:05:06 | INFO | __main__ : Batch-wise Cosine Similarity | 82.29% +2025-05-12T12:05:06 | INFO | __main__ : Cosine Embedding Loss | 0.1771 +2025-05-12T12:05:06 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:05:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:05:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:05:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:05:30 | INFO | __main__ : Step: 40900 +2025-05-12T12:05:30 | INFO | __main__ : Current Frame Index within Batch Video: 177/247 +2025-05-12T12:05:30 | INFO | __main__ : Batch-wise Cosine Similarity | 90.02% +2025-05-12T12:05:30 | INFO | __main__ : Cosine Embedding Loss | 0.0998 +2025-05-12T12:05:30 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:05:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:05:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:05:47 | INFO | utils.basic_utils : Train Epoch: [0] [ 169/4978] eta: 3 days, 7:57:08 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0670 eval_avg_sim: 0.5751 video-cosine_similarity: 0.9330 time: 59.3120 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:05:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:05:54 | INFO | __main__ : Step: 41000 +2025-05-12T12:05:54 | INFO | __main__ : Current Frame Index within Batch Video: 36/247 +2025-05-12T12:05:54 | INFO | __main__ : Batch-wise Cosine Similarity | 75.46% +2025-05-12T12:05:54 | INFO | __main__ : Cosine Embedding Loss | 0.2454 +2025-05-12T12:05:54 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:05:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:05:54 | INFO | 
__main__ : Evaluation Average Sim | 0.5751 +2025-05-12T12:05:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:05:54 | INFO | __main__ : Performing periodic evaluation at global step 41000... +2025-05-12T12:05:54 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:05:54 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:05:54 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:05:54 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:06:04 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.7102 +2025-05-12T12:06:04 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0041000.png +2025-05-12T12:06:04 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:06:04 | INFO | __main__ : Evaluation at step 41000 complete. 
Average Similarity: 0.7102 +2025-05-12T12:06:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:06:27 | INFO | __main__ : Step: 41100 +2025-05-12T12:06:27 | INFO | __main__ : Current Frame Index within Batch Video: 136/247 +2025-05-12T12:06:27 | INFO | __main__ : Batch-wise Cosine Similarity | 89.41% +2025-05-12T12:06:27 | INFO | __main__ : Cosine Embedding Loss | 0.1059 +2025-05-12T12:06:27 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:06:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:06:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:06:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:06:51 | INFO | __main__ : Step: 41200 +2025-05-12T12:06:51 | INFO | __main__ : Current Frame Index within Batch Video: 236/247 +2025-05-12T12:06:51 | INFO | __main__ : Batch-wise Cosine Similarity | 95.02% +2025-05-12T12:06:51 | INFO | __main__ : Cosine Embedding Loss | 0.0498 +2025-05-12T12:06:51 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:06:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:06:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:06:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 170/4978] eta: 3 days, 7:59:31 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0430 eval_avg_sim: 0.7102 video-cosine_similarity: 0.9570 time: 59.7961 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:07:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:07:15 | INFO | __main__ : Step: 41300 +2025-05-12T12:07:15 | INFO | __main__ : Current Frame Index within Batch Video: 95/247 +2025-05-12T12:07:15 | INFO | __main__ : Batch-wise Cosine Similarity | 84.60% +2025-05-12T12:07:15 | INFO | 
__main__ : Cosine Embedding Loss | 0.1540 +2025-05-12T12:07:15 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:07:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:07:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:07:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:07:39 | INFO | __main__ : Step: 41400 +2025-05-12T12:07:39 | INFO | __main__ : Current Frame Index within Batch Video: 195/247 +2025-05-12T12:07:39 | INFO | __main__ : Batch-wise Cosine Similarity | 89.15% +2025-05-12T12:07:39 | INFO | __main__ : Cosine Embedding Loss | 0.1085 +2025-05-12T12:07:39 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:07:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:07:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:07:51 | INFO | utils.basic_utils : Train Epoch: [0] [ 171/4978] eta: 3 days, 7:57:24 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0934 eval_avg_sim: 0.7102 video-cosine_similarity: 0.9066 time: 59.7996 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:08:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:08:03 | INFO | __main__ : Step: 41500 +2025-05-12T12:08:03 | INFO | __main__ : Current Frame Index within Batch Video: 54/247 +2025-05-12T12:08:03 | INFO | __main__ : Batch-wise Cosine Similarity | 78.91% +2025-05-12T12:08:03 | INFO | __main__ : Cosine Embedding Loss | 0.2109 +2025-05-12T12:08:03 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:08:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:08:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:08:26 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:08:26 | INFO | __main__ : Step: 41600 +2025-05-12T12:08:26 | INFO | __main__ : Current Frame Index within Batch Video: 154/247 +2025-05-12T12:08:26 | INFO | __main__ : Batch-wise Cosine Similarity | 90.35% +2025-05-12T12:08:26 | INFO | __main__ : Cosine Embedding Loss | 0.0965 +2025-05-12T12:08:26 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:08:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:08:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:08:49 | INFO | utils.basic_utils : Train Epoch: [0] [ 172/4978] eta: 3 days, 7:55:16 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0850 eval_avg_sim: 0.7102 video-cosine_similarity: 0.9150 time: 59.8021 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:08:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:08:50 | INFO | __main__ : Step: 41700 +2025-05-12T12:08:50 | INFO | __main__ : Current Frame Index within Batch Video: 13/247 +2025-05-12T12:08:50 | INFO | __main__ : Batch-wise Cosine Similarity | 56.71% +2025-05-12T12:08:50 | INFO | __main__ : Cosine Embedding Loss | 0.4329 +2025-05-12T12:08:50 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:08:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:08:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:09:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:09:14 | INFO | __main__ : Step: 41800 +2025-05-12T12:09:14 | INFO | __main__ : Current Frame Index within Batch Video: 113/247 +2025-05-12T12:09:14 | INFO | __main__ : Batch-wise Cosine Similarity | 86.63% +2025-05-12T12:09:14 | INFO | __main__ : Cosine Embedding Loss | 0.1337 +2025-05-12T12:09:14 | INFO 
| __main__ : Learning Rate | 0.000007 +2025-05-12T12:09:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:09:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:09:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:09:38 | INFO | __main__ : Step: 41900 +2025-05-12T12:09:38 | INFO | __main__ : Current Frame Index within Batch Video: 213/247 +2025-05-12T12:09:38 | INFO | __main__ : Batch-wise Cosine Similarity | 88.86% +2025-05-12T12:09:38 | INFO | __main__ : Cosine Embedding Loss | 0.1114 +2025-05-12T12:09:38 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:09:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:09:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:09:46 | INFO | utils.basic_utils : Train Epoch: [0] [ 173/4978] eta: 3 days, 7:53:10 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0816 eval_avg_sim: 0.7102 video-cosine_similarity: 0.9184 time: 59.3352 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:10:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:10:02 | INFO | __main__ : Step: 42000 +2025-05-12T12:10:02 | INFO | __main__ : Current Frame Index within Batch Video: 72/247 +2025-05-12T12:10:02 | INFO | __main__ : Batch-wise Cosine Similarity | 83.47% +2025-05-12T12:10:02 | INFO | __main__ : Cosine Embedding Loss | 0.1653 +2025-05-12T12:10:02 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:10:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:10:02 | INFO | __main__ : Evaluation Average Sim | 0.7102 +2025-05-12T12:10:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:10:02 | INFO | __main__ : Performing periodic evaluation at global step 42000... 
+2025-05-12T12:10:02 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:10:02 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:10:02 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:10:02 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:10:11 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5580 +2025-05-12T12:10:12 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0042000.png +2025-05-12T12:10:12 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:10:12 | INFO | __main__ : Evaluation at step 42000 complete. Average Similarity: 0.5580 +2025-05-12T12:10:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:10:35 | INFO | __main__ : Step: 42100 +2025-05-12T12:10:35 | INFO | __main__ : Current Frame Index within Batch Video: 172/247 +2025-05-12T12:10:35 | INFO | __main__ : Batch-wise Cosine Similarity | 91.18% +2025-05-12T12:10:35 | INFO | __main__ : Cosine Embedding Loss | 0.0882 +2025-05-12T12:10:35 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:10:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:10:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:10:53 | INFO | utils.basic_utils : Train Epoch: [0] [ 174/4978] eta: 3 days, 7:55:22 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0900 eval_avg_sim: 0.5580 video-cosine_similarity: 0.9100 time: 59.8070 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:10:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:10:59 | INFO | __main__ : 
Step: 42200 +2025-05-12T12:10:59 | INFO | __main__ : Current Frame Index within Batch Video: 31/247 +2025-05-12T12:10:59 | INFO | __main__ : Batch-wise Cosine Similarity | 72.41% +2025-05-12T12:10:59 | INFO | __main__ : Cosine Embedding Loss | 0.2759 +2025-05-12T12:10:59 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:10:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:10:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:11:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:11:23 | INFO | __main__ : Step: 42300 +2025-05-12T12:11:23 | INFO | __main__ : Current Frame Index within Batch Video: 131/247 +2025-05-12T12:11:23 | INFO | __main__ : Batch-wise Cosine Similarity | 89.67% +2025-05-12T12:11:23 | INFO | __main__ : Cosine Embedding Loss | 0.1033 +2025-05-12T12:11:23 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:11:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:11:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:11:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:11:47 | INFO | __main__ : Step: 42400 +2025-05-12T12:11:47 | INFO | __main__ : Current Frame Index within Batch Video: 231/247 +2025-05-12T12:11:47 | INFO | __main__ : Batch-wise Cosine Similarity | 94.03% +2025-05-12T12:11:47 | INFO | __main__ : Cosine Embedding Loss | 0.0597 +2025-05-12T12:11:47 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:11:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:11:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:11:50 | INFO | utils.basic_utils : Train Epoch: [0] [ 175/4978] eta: 3 days, 7:53:17 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0552 
eval_avg_sim: 0.5580 video-cosine_similarity: 0.9448 time: 59.8104 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:12:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:12:11 | INFO | __main__ : Step: 42500 +2025-05-12T12:12:11 | INFO | __main__ : Current Frame Index within Batch Video: 90/247 +2025-05-12T12:12:11 | INFO | __main__ : Batch-wise Cosine Similarity | 85.21% +2025-05-12T12:12:11 | INFO | __main__ : Cosine Embedding Loss | 0.1479 +2025-05-12T12:12:11 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:12:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:12:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:12:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:12:34 | INFO | __main__ : Step: 42600 +2025-05-12T12:12:34 | INFO | __main__ : Current Frame Index within Batch Video: 190/247 +2025-05-12T12:12:34 | INFO | __main__ : Batch-wise Cosine Similarity | 93.38% +2025-05-12T12:12:34 | INFO | __main__ : Cosine Embedding Loss | 0.0662 +2025-05-12T12:12:34 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:12:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:12:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:12:48 | INFO | utils.basic_utils : Train Epoch: [0] [ 176/4978] eta: 3 days, 7:51:11 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0534 eval_avg_sim: 0.5580 video-cosine_similarity: 0.9466 time: 59.8113 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:12:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:12:58 | INFO | __main__ : Step: 42700 +2025-05-12T12:12:58 | INFO | __main__ : Current Frame Index within Batch Video: 49/247 +2025-05-12T12:12:58 | INFO 
| __main__ : Batch-wise Cosine Similarity | 81.65% +2025-05-12T12:12:58 | INFO | __main__ : Cosine Embedding Loss | 0.1835 +2025-05-12T12:12:58 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:12:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:12:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:13:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:13:22 | INFO | __main__ : Step: 42800 +2025-05-12T12:13:22 | INFO | __main__ : Current Frame Index within Batch Video: 149/247 +2025-05-12T12:13:22 | INFO | __main__ : Batch-wise Cosine Similarity | 92.35% +2025-05-12T12:13:22 | INFO | __main__ : Cosine Embedding Loss | 0.0765 +2025-05-12T12:13:22 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:13:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:13:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:13:45 | INFO | utils.basic_utils : Train Epoch: [0] [ 177/4978] eta: 3 days, 7:49:06 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0496 eval_avg_sim: 0.5580 video-cosine_similarity: 0.9504 time: 59.3440 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:13:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:13:46 | INFO | __main__ : Step: 42900 +2025-05-12T12:13:46 | INFO | __main__ : Current Frame Index within Batch Video: 8/247 +2025-05-12T12:13:46 | INFO | __main__ : Batch-wise Cosine Similarity | 60.78% +2025-05-12T12:13:46 | INFO | __main__ : Cosine Embedding Loss | 0.3922 +2025-05-12T12:13:46 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:13:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:13:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:14:10 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:14:10 | INFO | __main__ : Step: 43000 +2025-05-12T12:14:10 | INFO | __main__ : Current Frame Index within Batch Video: 108/247 +2025-05-12T12:14:10 | INFO | __main__ : Batch-wise Cosine Similarity | 85.76% +2025-05-12T12:14:10 | INFO | __main__ : Cosine Embedding Loss | 0.1424 +2025-05-12T12:14:10 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:14:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:14:10 | INFO | __main__ : Evaluation Average Sim | 0.5580 +2025-05-12T12:14:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:14:10 | INFO | __main__ : Performing periodic evaluation at global step 43000... +2025-05-12T12:14:10 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:14:10 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:14:10 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:14:10 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:14:19 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5446 +2025-05-12T12:14:19 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0043000.png +2025-05-12T12:14:19 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:14:19 | INFO | __main__ : Evaluation at step 43000 complete. 
Average Similarity: 0.5446 +2025-05-12T12:14:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:14:43 | INFO | __main__ : Step: 43100 +2025-05-12T12:14:43 | INFO | __main__ : Current Frame Index within Batch Video: 208/247 +2025-05-12T12:14:43 | INFO | __main__ : Batch-wise Cosine Similarity | 89.18% +2025-05-12T12:14:43 | INFO | __main__ : Cosine Embedding Loss | 0.1082 +2025-05-12T12:14:43 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:14:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:14:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:14:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 178/4978] eta: 3 days, 7:51:13 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0919 eval_avg_sim: 0.5446 video-cosine_similarity: 0.9081 time: 59.8131 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:15:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:15:07 | INFO | __main__ : Step: 43200 +2025-05-12T12:15:07 | INFO | __main__ : Current Frame Index within Batch Video: 67/247 +2025-05-12T12:15:07 | INFO | __main__ : Batch-wise Cosine Similarity | 84.15% +2025-05-12T12:15:07 | INFO | __main__ : Cosine Embedding Loss | 0.1585 +2025-05-12T12:15:07 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:15:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:15:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:15:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:15:31 | INFO | __main__ : Step: 43300 +2025-05-12T12:15:31 | INFO | __main__ : Current Frame Index within Batch Video: 167/247 +2025-05-12T12:15:31 | INFO | __main__ : Batch-wise Cosine Similarity | 89.66% +2025-05-12T12:15:31 | INFO | 
__main__ : Cosine Embedding Loss | 0.1034 +2025-05-12T12:15:31 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:15:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:15:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:15:50 | INFO | utils.basic_utils : Train Epoch: [0] [ 179/4978] eta: 3 days, 7:49:08 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0643 eval_avg_sim: 0.5446 video-cosine_similarity: 0.9357 time: 59.8138 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:15:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:15:54 | INFO | __main__ : Step: 43400 +2025-05-12T12:15:54 | INFO | __main__ : Current Frame Index within Batch Video: 26/247 +2025-05-12T12:15:54 | INFO | __main__ : Batch-wise Cosine Similarity | 70.39% +2025-05-12T12:15:54 | INFO | __main__ : Cosine Embedding Loss | 0.2961 +2025-05-12T12:15:54 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:15:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:15:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:16:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:16:18 | INFO | __main__ : Step: 43500 +2025-05-12T12:16:18 | INFO | __main__ : Current Frame Index within Batch Video: 126/247 +2025-05-12T12:16:18 | INFO | __main__ : Batch-wise Cosine Similarity | 89.14% +2025-05-12T12:16:18 | INFO | __main__ : Cosine Embedding Loss | 0.1086 +2025-05-12T12:16:18 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:16:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:16:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:16:42 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:16:42 | INFO | __main__ : Step: 43600 +2025-05-12T12:16:42 | INFO | __main__ : Current Frame Index within Batch Video: 226/247 +2025-05-12T12:16:42 | INFO | __main__ : Batch-wise Cosine Similarity | 93.47% +2025-05-12T12:16:42 | INFO | __main__ : Cosine Embedding Loss | 0.0653 +2025-05-12T12:16:42 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:16:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:16:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:16:47 | INFO | utils.basic_utils : Train Epoch: [0] [ 180/4978] eta: 3 days, 7:47:04 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0659 eval_avg_sim: 0.5446 video-cosine_similarity: 0.9341 time: 59.8136 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:17:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:17:06 | INFO | __main__ : Step: 43700 +2025-05-12T12:17:06 | INFO | __main__ : Current Frame Index within Batch Video: 85/247 +2025-05-12T12:17:06 | INFO | __main__ : Batch-wise Cosine Similarity | 85.72% +2025-05-12T12:17:06 | INFO | __main__ : Cosine Embedding Loss | 0.1428 +2025-05-12T12:17:06 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:17:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:17:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:17:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:17:30 | INFO | __main__ : Step: 43800 +2025-05-12T12:17:30 | INFO | __main__ : Current Frame Index within Batch Video: 185/247 +2025-05-12T12:17:30 | INFO | __main__ : Batch-wise Cosine Similarity | 90.29% +2025-05-12T12:17:30 | INFO | __main__ : Cosine Embedding Loss | 0.0971 +2025-05-12T12:17:30 | INFO 
| __main__ : Learning Rate | 0.000007 +2025-05-12T12:17:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:17:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:17:44 | INFO | utils.basic_utils : Train Epoch: [0] [ 181/4978] eta: 3 days, 7:45:00 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0573 eval_avg_sim: 0.5446 video-cosine_similarity: 0.9427 time: 59.3488 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:17:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:17:54 | INFO | __main__ : Step: 43900 +2025-05-12T12:17:54 | INFO | __main__ : Current Frame Index within Batch Video: 44/247 +2025-05-12T12:17:54 | INFO | __main__ : Batch-wise Cosine Similarity | 78.68% +2025-05-12T12:17:54 | INFO | __main__ : Cosine Embedding Loss | 0.2132 +2025-05-12T12:17:54 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:17:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:17:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:18:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:18:17 | INFO | __main__ : Step: 44000 +2025-05-12T12:18:17 | INFO | __main__ : Current Frame Index within Batch Video: 144/247 +2025-05-12T12:18:17 | INFO | __main__ : Batch-wise Cosine Similarity | 91.08% +2025-05-12T12:18:17 | INFO | __main__ : Cosine Embedding Loss | 0.0892 +2025-05-12T12:18:17 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:18:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:18:17 | INFO | __main__ : Evaluation Average Sim | 0.5446 +2025-05-12T12:18:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:18:18 | INFO | __main__ : Performing periodic evaluation at global step 44000... 
+2025-05-12T12:18:18 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:18:18 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:18:18 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:18:18 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:18:27 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5547 +2025-05-12T12:18:27 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0044000.png +2025-05-12T12:18:27 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:18:27 | INFO | __main__ : Evaluation at step 44000 complete. Average Similarity: 0.5547 +2025-05-12T12:18:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:18:50 | INFO | __main__ : Step: 44100 +2025-05-12T12:18:50 | INFO | __main__ : Current Frame Index within Batch Video: 244/247 +2025-05-12T12:18:50 | INFO | __main__ : Batch-wise Cosine Similarity | 92.54% +2025-05-12T12:18:50 | INFO | __main__ : Cosine Embedding Loss | 0.0746 +2025-05-12T12:18:50 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:18:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:18:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:18:51 | INFO | utils.basic_utils : Train Epoch: [0] [ 182/4978] eta: 3 days, 7:47:00 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0756 eval_avg_sim: 0.5547 video-cosine_similarity: 0.9244 time: 59.8103 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:19:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:19:14 | INFO | __main__ : 
Step: 44200 +2025-05-12T12:19:14 | INFO | __main__ : Current Frame Index within Batch Video: 103/247 +2025-05-12T12:19:14 | INFO | __main__ : Batch-wise Cosine Similarity | 86.80% +2025-05-12T12:19:14 | INFO | __main__ : Cosine Embedding Loss | 0.1320 +2025-05-12T12:19:14 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:19:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:19:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:19:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:19:38 | INFO | __main__ : Step: 44300 +2025-05-12T12:19:38 | INFO | __main__ : Current Frame Index within Batch Video: 203/247 +2025-05-12T12:19:38 | INFO | __main__ : Batch-wise Cosine Similarity | 93.14% +2025-05-12T12:19:38 | INFO | __main__ : Cosine Embedding Loss | 0.0686 +2025-05-12T12:19:38 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:19:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:19:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:19:49 | INFO | utils.basic_utils : Train Epoch: [0] [ 183/4978] eta: 3 days, 7:44:56 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0540 eval_avg_sim: 0.5547 video-cosine_similarity: 0.9460 time: 59.8074 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:20:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:20:02 | INFO | __main__ : Step: 44400 +2025-05-12T12:20:02 | INFO | __main__ : Current Frame Index within Batch Video: 62/247 +2025-05-12T12:20:02 | INFO | __main__ : Batch-wise Cosine Similarity | 83.74% +2025-05-12T12:20:02 | INFO | __main__ : Cosine Embedding Loss | 0.1626 +2025-05-12T12:20:02 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:20:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:20:02 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:20:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:20:26 | INFO | __main__ : Step: 44500 +2025-05-12T12:20:26 | INFO | __main__ : Current Frame Index within Batch Video: 162/247 +2025-05-12T12:20:26 | INFO | __main__ : Batch-wise Cosine Similarity | 89.88% +2025-05-12T12:20:26 | INFO | __main__ : Cosine Embedding Loss | 0.1012 +2025-05-12T12:20:26 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:20:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:20:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:20:46 | INFO | utils.basic_utils : Train Epoch: [0] [ 184/4978] eta: 3 days, 7:42:52 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0909 eval_avg_sim: 0.5547 video-cosine_similarity: 0.9091 time: 59.8050 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:20:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:20:50 | INFO | __main__ : Step: 44600 +2025-05-12T12:20:50 | INFO | __main__ : Current Frame Index within Batch Video: 21/247 +2025-05-12T12:20:50 | INFO | __main__ : Batch-wise Cosine Similarity | 69.11% +2025-05-12T12:20:50 | INFO | __main__ : Cosine Embedding Loss | 0.3089 +2025-05-12T12:20:50 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:20:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:20:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:21:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:21:13 | INFO | __main__ : Step: 44700 +2025-05-12T12:21:13 | INFO | __main__ : Current Frame Index within Batch Video: 121/247 +2025-05-12T12:21:13 | INFO | 
__main__ : Batch-wise Cosine Similarity | 90.23% +2025-05-12T12:21:13 | INFO | __main__ : Cosine Embedding Loss | 0.0977 +2025-05-12T12:21:13 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:21:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:21:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:21:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:21:37 | INFO | __main__ : Step: 44800 +2025-05-12T12:21:37 | INFO | __main__ : Current Frame Index within Batch Video: 221/247 +2025-05-12T12:21:37 | INFO | __main__ : Batch-wise Cosine Similarity | 95.35% +2025-05-12T12:21:37 | INFO | __main__ : Cosine Embedding Loss | 0.0465 +2025-05-12T12:21:37 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:21:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:21:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:21:43 | INFO | utils.basic_utils : Train Epoch: [0] [ 185/4978] eta: 3 days, 7:40:50 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0392 eval_avg_sim: 0.5547 video-cosine_similarity: 0.9608 time: 59.3313 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T12:22:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:22:01 | INFO | __main__ : Step: 44900 +2025-05-12T12:22:01 | INFO | __main__ : Current Frame Index within Batch Video: 80/247 +2025-05-12T12:22:01 | INFO | __main__ : Batch-wise Cosine Similarity | 85.39% +2025-05-12T12:22:01 | INFO | __main__ : Cosine Embedding Loss | 0.1461 +2025-05-12T12:22:01 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:22:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:22:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:22:25 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:22:25 | INFO | __main__ : Step: 45000 +2025-05-12T12:22:25 | INFO | __main__ : Current Frame Index within Batch Video: 180/247 +2025-05-12T12:22:25 | INFO | __main__ : Batch-wise Cosine Similarity | 90.74% +2025-05-12T12:22:25 | INFO | __main__ : Cosine Embedding Loss | 0.0926 +2025-05-12T12:22:25 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:22:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:22:25 | INFO | __main__ : Evaluation Average Sim | 0.5547 +2025-05-12T12:22:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:22:25 | INFO | __main__ : Saving checkpoint at global step 45000 +2025-05-12T12:22:25 | INFO | __main__ : Performing periodic evaluation at global step 45000... +2025-05-12T12:22:25 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:22:25 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:22:25 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:22:25 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:22:35 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4473 +2025-05-12T12:22:35 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0045000.png +2025-05-12T12:22:35 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:22:35 | INFO | __main__ : Evaluation at step 45000 complete. 
Average Similarity: 0.4473 +2025-05-12T12:22:50 | INFO | utils.basic_utils : Train Epoch: [0] [ 186/4978] eta: 3 days, 7:42:50 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0756 eval_avg_sim: 0.4473 video-cosine_similarity: 0.9244 time: 59.8041 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:22:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:22:58 | INFO | __main__ : Step: 45100 +2025-05-12T12:22:58 | INFO | __main__ : Current Frame Index within Batch Video: 39/247 +2025-05-12T12:22:58 | INFO | __main__ : Batch-wise Cosine Similarity | 75.94% +2025-05-12T12:22:58 | INFO | __main__ : Cosine Embedding Loss | 0.2406 +2025-05-12T12:22:58 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:22:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:22:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:23:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:23:22 | INFO | __main__ : Step: 45200 +2025-05-12T12:23:22 | INFO | __main__ : Current Frame Index within Batch Video: 139/247 +2025-05-12T12:23:22 | INFO | __main__ : Batch-wise Cosine Similarity | 87.99% +2025-05-12T12:23:22 | INFO | __main__ : Cosine Embedding Loss | 0.1201 +2025-05-12T12:23:22 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:23:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:23:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:23:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:23:46 | INFO | __main__ : Step: 45300 +2025-05-12T12:23:46 | INFO | __main__ : Current Frame Index within Batch Video: 239/247 +2025-05-12T12:23:46 | INFO | __main__ : Batch-wise Cosine Similarity | 91.78% +2025-05-12T12:23:46 | INFO | 
__main__ : Cosine Embedding Loss | 0.0822 +2025-05-12T12:23:46 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:23:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:23:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:23:48 | INFO | utils.basic_utils : Train Epoch: [0] [ 187/4978] eta: 3 days, 7:40:47 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0767 eval_avg_sim: 0.4473 video-cosine_similarity: 0.9233 time: 59.8024 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:24:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:24:10 | INFO | __main__ : Step: 45400 +2025-05-12T12:24:10 | INFO | __main__ : Current Frame Index within Batch Video: 98/247 +2025-05-12T12:24:10 | INFO | __main__ : Batch-wise Cosine Similarity | 86.91% +2025-05-12T12:24:10 | INFO | __main__ : Cosine Embedding Loss | 0.1309 +2025-05-12T12:24:10 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:24:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:24:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:24:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:24:33 | INFO | __main__ : Step: 45500 +2025-05-12T12:24:33 | INFO | __main__ : Current Frame Index within Batch Video: 198/247 +2025-05-12T12:24:33 | INFO | __main__ : Batch-wise Cosine Similarity | 90.36% +2025-05-12T12:24:33 | INFO | __main__ : Cosine Embedding Loss | 0.0964 +2025-05-12T12:24:33 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:24:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:24:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:24:45 | INFO | utils.basic_utils : Train Epoch: [0] [ 188/4978] eta: 3 days, 7:38:44 lr: 
0.000007 temperature: 0.0126 video-loss_cosine: 0.0640 eval_avg_sim: 0.4473 video-cosine_similarity: 0.9360 time: 59.7994 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:24:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:24:57 | INFO | __main__ : Step: 45600 +2025-05-12T12:24:57 | INFO | __main__ : Current Frame Index within Batch Video: 57/247 +2025-05-12T12:24:57 | INFO | __main__ : Batch-wise Cosine Similarity | 82.96% +2025-05-12T12:24:57 | INFO | __main__ : Cosine Embedding Loss | 0.1704 +2025-05-12T12:24:57 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:24:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:24:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:25:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:25:21 | INFO | __main__ : Step: 45700 +2025-05-12T12:25:21 | INFO | __main__ : Current Frame Index within Batch Video: 157/247 +2025-05-12T12:25:21 | INFO | __main__ : Batch-wise Cosine Similarity | 93.49% +2025-05-12T12:25:21 | INFO | __main__ : Cosine Embedding Loss | 0.0651 +2025-05-12T12:25:21 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:25:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:25:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:25:43 | INFO | utils.basic_utils : Train Epoch: [0] [ 189/4978] eta: 3 days, 7:36:44 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0592 eval_avg_sim: 0.4473 video-cosine_similarity: 0.9408 time: 59.7975 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:25:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:25:45 | INFO | __main__ : Step: 45800 +2025-05-12T12:25:45 | INFO | __main__ : Current Frame Index 
within Batch Video: 16/247 +2025-05-12T12:25:45 | INFO | __main__ : Batch-wise Cosine Similarity | 63.91% +2025-05-12T12:25:45 | INFO | __main__ : Cosine Embedding Loss | 0.3609 +2025-05-12T12:25:45 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:25:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:25:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:26:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:26:09 | INFO | __main__ : Step: 45900 +2025-05-12T12:26:09 | INFO | __main__ : Current Frame Index within Batch Video: 116/247 +2025-05-12T12:26:09 | INFO | __main__ : Batch-wise Cosine Similarity | 89.39% +2025-05-12T12:26:09 | INFO | __main__ : Cosine Embedding Loss | 0.1061 +2025-05-12T12:26:09 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:26:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:26:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:26:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:26:33 | INFO | __main__ : Step: 46000 +2025-05-12T12:26:33 | INFO | __main__ : Current Frame Index within Batch Video: 216/247 +2025-05-12T12:26:33 | INFO | __main__ : Batch-wise Cosine Similarity | 95.12% +2025-05-12T12:26:33 | INFO | __main__ : Cosine Embedding Loss | 0.0488 +2025-05-12T12:26:33 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:26:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:26:33 | INFO | __main__ : Evaluation Average Sim | 0.4473 +2025-05-12T12:26:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:26:33 | INFO | __main__ : Performing periodic evaluation at global step 46000... 
+2025-05-12T12:26:33 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:26:33 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:26:33 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:26:33 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:26:42 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4943 +2025-05-12T12:26:42 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0046000.png +2025-05-12T12:26:42 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:26:42 | INFO | __main__ : Evaluation at step 46000 complete. Average Similarity: 0.4943 +2025-05-12T12:26:49 | INFO | utils.basic_utils : Train Epoch: [0] [ 190/4978] eta: 3 days, 7:38:37 lr: 0.000007 temperature: 0.0126 video-loss_cosine: 0.0509 eval_avg_sim: 0.4943 video-cosine_similarity: 0.9491 time: 59.7808 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:27:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:27:06 | INFO | __main__ : Step: 46100 +2025-05-12T12:27:06 | INFO | __main__ : Current Frame Index within Batch Video: 75/247 +2025-05-12T12:27:06 | INFO | __main__ : Batch-wise Cosine Similarity | 84.25% +2025-05-12T12:27:06 | INFO | __main__ : Cosine Embedding Loss | 0.1575 +2025-05-12T12:27:06 | INFO | __main__ : Learning Rate | 0.000007 +2025-05-12T12:27:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:27:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:27:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:27:30 | INFO | __main__ : 
Step: 46200 +2025-05-12T12:27:30 | INFO | __main__ : Current Frame Index within Batch Video: 175/247 +2025-05-12T12:27:30 | INFO | __main__ : Batch-wise Cosine Similarity | 90.41% +2025-05-12T12:27:30 | INFO | __main__ : Cosine Embedding Loss | 0.0959 +2025-05-12T12:27:30 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:27:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:27:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:27:47 | INFO | utils.basic_utils : Train Epoch: [0] [ 191/4978] eta: 3 days, 7:36:35 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.1142 eval_avg_sim: 0.4943 video-cosine_similarity: 0.8858 time: 59.7766 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:27:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:27:53 | INFO | __main__ : Step: 46300 +2025-05-12T12:27:53 | INFO | __main__ : Current Frame Index within Batch Video: 34/247 +2025-05-12T12:27:53 | INFO | __main__ : Batch-wise Cosine Similarity | 74.61% +2025-05-12T12:27:53 | INFO | __main__ : Cosine Embedding Loss | 0.2539 +2025-05-12T12:27:53 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:27:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:27:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:28:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:28:17 | INFO | __main__ : Step: 46400 +2025-05-12T12:28:17 | INFO | __main__ : Current Frame Index within Batch Video: 134/247 +2025-05-12T12:28:17 | INFO | __main__ : Batch-wise Cosine Similarity | 84.38% +2025-05-12T12:28:17 | INFO | __main__ : Cosine Embedding Loss | 0.1562 +2025-05-12T12:28:17 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:28:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:28:17 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:28:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:28:41 | INFO | __main__ : Step: 46500 +2025-05-12T12:28:41 | INFO | __main__ : Current Frame Index within Batch Video: 234/247 +2025-05-12T12:28:41 | INFO | __main__ : Batch-wise Cosine Similarity | 90.81% +2025-05-12T12:28:41 | INFO | __main__ : Cosine Embedding Loss | 0.0919 +2025-05-12T12:28:41 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:28:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:28:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:28:44 | INFO | utils.basic_utils : Train Epoch: [0] [ 192/4978] eta: 3 days, 7:34:33 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0989 eval_avg_sim: 0.4943 video-cosine_similarity: 0.9011 time: 59.7743 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:29:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:29:05 | INFO | __main__ : Step: 46600 +2025-05-12T12:29:05 | INFO | __main__ : Current Frame Index within Batch Video: 93/247 +2025-05-12T12:29:05 | INFO | __main__ : Batch-wise Cosine Similarity | 87.18% +2025-05-12T12:29:05 | INFO | __main__ : Cosine Embedding Loss | 0.1282 +2025-05-12T12:29:05 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:29:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:29:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:29:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:29:29 | INFO | __main__ : Step: 46700 +2025-05-12T12:29:29 | INFO | __main__ : Current Frame Index within Batch Video: 193/247 +2025-05-12T12:29:29 | INFO | 
__main__ : Batch-wise Cosine Similarity | 90.92% +2025-05-12T12:29:29 | INFO | __main__ : Cosine Embedding Loss | 0.0908 +2025-05-12T12:29:29 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:29:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:29:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:29:42 | INFO | utils.basic_utils : Train Epoch: [0] [ 193/4978] eta: 3 days, 7:32:34 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0633 eval_avg_sim: 0.4943 video-cosine_similarity: 0.9367 time: 59.7744 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:29:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:29:53 | INFO | __main__ : Step: 46800 +2025-05-12T12:29:53 | INFO | __main__ : Current Frame Index within Batch Video: 52/247 +2025-05-12T12:29:53 | INFO | __main__ : Batch-wise Cosine Similarity | 80.75% +2025-05-12T12:29:53 | INFO | __main__ : Cosine Embedding Loss | 0.1925 +2025-05-12T12:29:53 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:29:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:29:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:30:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:30:16 | INFO | __main__ : Step: 46900 +2025-05-12T12:30:16 | INFO | __main__ : Current Frame Index within Batch Video: 152/247 +2025-05-12T12:30:16 | INFO | __main__ : Batch-wise Cosine Similarity | 89.39% +2025-05-12T12:30:16 | INFO | __main__ : Cosine Embedding Loss | 0.1061 +2025-05-12T12:30:16 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:30:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:30:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:30:39 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 194/4978] eta: 3 days, 7:30:36 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0683 eval_avg_sim: 0.4943 video-cosine_similarity: 0.9317 time: 59.3043 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:30:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:30:40 | INFO | __main__ : Step: 47000 +2025-05-12T12:30:40 | INFO | __main__ : Current Frame Index within Batch Video: 11/247 +2025-05-12T12:30:40 | INFO | __main__ : Batch-wise Cosine Similarity | 61.51% +2025-05-12T12:30:40 | INFO | __main__ : Cosine Embedding Loss | 0.3849 +2025-05-12T12:30:40 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:30:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:30:40 | INFO | __main__ : Evaluation Average Sim | 0.4943 +2025-05-12T12:30:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:30:41 | INFO | __main__ : Performing periodic evaluation at global step 47000... +2025-05-12T12:30:41 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:30:41 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:30:41 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:30:41 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:30:50 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6490 +2025-05-12T12:30:50 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0047000.png +2025-05-12T12:30:50 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:30:50 | INFO | __main__ : Evaluation at step 47000 complete. 
Average Similarity: 0.6490 +2025-05-12T12:31:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:31:14 | INFO | __main__ : Step: 47100 +2025-05-12T12:31:14 | INFO | __main__ : Current Frame Index within Batch Video: 111/247 +2025-05-12T12:31:14 | INFO | __main__ : Batch-wise Cosine Similarity | 88.21% +2025-05-12T12:31:14 | INFO | __main__ : Cosine Embedding Loss | 0.1179 +2025-05-12T12:31:14 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:31:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:31:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:31:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:31:38 | INFO | __main__ : Step: 47200 +2025-05-12T12:31:38 | INFO | __main__ : Current Frame Index within Batch Video: 211/247 +2025-05-12T12:31:38 | INFO | __main__ : Batch-wise Cosine Similarity | 89.94% +2025-05-12T12:31:38 | INFO | __main__ : Cosine Embedding Loss | 0.1006 +2025-05-12T12:31:38 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:31:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:31:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:31:46 | INFO | utils.basic_utils : Train Epoch: [0] [ 195/4978] eta: 3 days, 7:32:37 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0593 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9407 time: 59.7919 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:32:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:32:02 | INFO | __main__ : Step: 47300 +2025-05-12T12:32:02 | INFO | __main__ : Current Frame Index within Batch Video: 70/247 +2025-05-12T12:32:02 | INFO | __main__ : Batch-wise Cosine Similarity | 86.50% +2025-05-12T12:32:02 | INFO | 
__main__ : Cosine Embedding Loss | 0.1350 +2025-05-12T12:32:02 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:32:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:32:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:32:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:32:25 | INFO | __main__ : Step: 47400 +2025-05-12T12:32:25 | INFO | __main__ : Current Frame Index within Batch Video: 170/247 +2025-05-12T12:32:25 | INFO | __main__ : Batch-wise Cosine Similarity | 92.58% +2025-05-12T12:32:25 | INFO | __main__ : Cosine Embedding Loss | 0.0742 +2025-05-12T12:32:25 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:32:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:32:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:32:44 | INFO | utils.basic_utils : Train Epoch: [0] [ 196/4978] eta: 3 days, 7:30:38 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0529 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9471 time: 59.7911 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T12:32:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:32:49 | INFO | __main__ : Step: 47500 +2025-05-12T12:32:49 | INFO | __main__ : Current Frame Index within Batch Video: 29/247 +2025-05-12T12:32:49 | INFO | __main__ : Batch-wise Cosine Similarity | 75.68% +2025-05-12T12:32:49 | INFO | __main__ : Cosine Embedding Loss | 0.2432 +2025-05-12T12:32:49 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:32:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:32:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:33:13 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:33:13 | INFO | __main__ : Step: 47600 +2025-05-12T12:33:13 | INFO | __main__ : Current Frame Index within Batch Video: 129/247 +2025-05-12T12:33:13 | INFO | __main__ : Batch-wise Cosine Similarity | 92.05% +2025-05-12T12:33:13 | INFO | __main__ : Cosine Embedding Loss | 0.0795 +2025-05-12T12:33:13 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:33:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:33:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:33:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:33:37 | INFO | __main__ : Step: 47700 +2025-05-12T12:33:37 | INFO | __main__ : Current Frame Index within Batch Video: 229/247 +2025-05-12T12:33:37 | INFO | __main__ : Batch-wise Cosine Similarity | 93.38% +2025-05-12T12:33:37 | INFO | __main__ : Cosine Embedding Loss | 0.0662 +2025-05-12T12:33:37 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:33:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:33:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:33:41 | INFO | utils.basic_utils : Train Epoch: [0] [ 197/4978] eta: 3 days, 7:28:41 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0686 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9314 time: 59.7935 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:34:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:34:01 | INFO | __main__ : Step: 47800 +2025-05-12T12:34:01 | INFO | __main__ : Current Frame Index within Batch Video: 88/247 +2025-05-12T12:34:01 | INFO | __main__ : Batch-wise Cosine Similarity | 84.77% +2025-05-12T12:34:01 | INFO | __main__ : Cosine Embedding Loss | 0.1523 +2025-05-12T12:34:01 | INFO 
| __main__ : Learning Rate | 0.000008 +2025-05-12T12:34:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:34:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:34:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:34:25 | INFO | __main__ : Step: 47900 +2025-05-12T12:34:25 | INFO | __main__ : Current Frame Index within Batch Video: 188/247 +2025-05-12T12:34:25 | INFO | __main__ : Batch-wise Cosine Similarity | 90.73% +2025-05-12T12:34:25 | INFO | __main__ : Cosine Embedding Loss | 0.0927 +2025-05-12T12:34:25 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:34:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:34:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:34:39 | INFO | utils.basic_utils : Train Epoch: [0] [ 198/4978] eta: 3 days, 7:26:43 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0776 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9224 time: 59.3240 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:34:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:34:49 | INFO | __main__ : Step: 48000 +2025-05-12T12:34:49 | INFO | __main__ : Current Frame Index within Batch Video: 47/247 +2025-05-12T12:34:49 | INFO | __main__ : Batch-wise Cosine Similarity | 80.27% +2025-05-12T12:34:49 | INFO | __main__ : Cosine Embedding Loss | 0.1973 +2025-05-12T12:34:49 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:34:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:34:49 | INFO | __main__ : Evaluation Average Sim | 0.6490 +2025-05-12T12:34:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:34:49 | INFO | __main__ : Performing periodic evaluation at global step 48000... 
+2025-05-12T12:34:49 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:34:49 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:34:49 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:34:49 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:34:58 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6847 +2025-05-12T12:34:58 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0048000.png +2025-05-12T12:34:58 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:34:58 | INFO | __main__ : Evaluation at step 48000 complete. Average Similarity: 0.6847 +2025-05-12T12:35:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:35:22 | INFO | __main__ : Step: 48100 +2025-05-12T12:35:22 | INFO | __main__ : Current Frame Index within Batch Video: 147/247 +2025-05-12T12:35:22 | INFO | __main__ : Batch-wise Cosine Similarity | 91.11% +2025-05-12T12:35:22 | INFO | __main__ : Cosine Embedding Loss | 0.0889 +2025-05-12T12:35:22 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:35:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:35:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:35:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:35:46 | INFO | __main__ : Step: 48200 +2025-05-12T12:35:46 | INFO | __main__ : Current Frame Index within Batch Video: 247/247 +2025-05-12T12:35:46 | INFO | __main__ : Batch-wise Cosine Similarity | 95.25% +2025-05-12T12:35:46 | INFO | __main__ : Cosine Embedding Loss | 0.0475 
+2025-05-12T12:35:46 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:35:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:35:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:35:46 | INFO | utils.basic_utils : Train Epoch: [0] [ 199/4978] eta: 3 days, 7:28:37 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0475 eval_avg_sim: 0.6847 video-cosine_similarity: 0.9525 time: 59.8054 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:36:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:36:10 | INFO | __main__ : Step: 48300 +2025-05-12T12:36:10 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T12:36:10 | INFO | __main__ : Batch-wise Cosine Similarity | 88.62% +2025-05-12T12:36:10 | INFO | __main__ : Cosine Embedding Loss | 0.1138 +2025-05-12T12:36:10 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:36:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:36:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:36:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:36:33 | INFO | __main__ : Step: 48400 +2025-05-12T12:36:33 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T12:36:33 | INFO | __main__ : Batch-wise Cosine Similarity | 92.59% +2025-05-12T12:36:33 | INFO | __main__ : Cosine Embedding Loss | 0.0741 +2025-05-12T12:36:33 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:36:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:36:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:36:43 | INFO | utils.basic_utils : Train Epoch: [0] [ 200/4978] eta: 3 days, 7:26:38 lr: 0.000008 temperature: 0.0126 
video-loss_cosine: 0.0693 eval_avg_sim: 0.6847 video-cosine_similarity: 0.9307 time: 59.8046 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:36:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:36:57 | INFO | __main__ : Step: 48500 +2025-05-12T12:36:57 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T12:36:57 | INFO | __main__ : Batch-wise Cosine Similarity | 82.03% +2025-05-12T12:36:57 | INFO | __main__ : Cosine Embedding Loss | 0.1797 +2025-05-12T12:36:57 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:36:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:36:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:37:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:37:21 | INFO | __main__ : Step: 48600 +2025-05-12T12:37:21 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T12:37:21 | INFO | __main__ : Batch-wise Cosine Similarity | 91.86% +2025-05-12T12:37:21 | INFO | __main__ : Cosine Embedding Loss | 0.0814 +2025-05-12T12:37:21 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:37:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:37:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:37:41 | INFO | utils.basic_utils : Train Epoch: [0] [ 201/4978] eta: 3 days, 7:24:41 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0706 eval_avg_sim: 0.6847 video-cosine_similarity: 0.9294 time: 59.8044 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:37:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:37:45 | INFO | __main__ : Step: 48700 +2025-05-12T12:37:45 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 
+2025-05-12T12:37:45 | INFO | __main__ : Batch-wise Cosine Similarity | 71.95% +2025-05-12T12:37:45 | INFO | __main__ : Cosine Embedding Loss | 0.2805 +2025-05-12T12:37:45 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:37:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:37:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:09 | INFO | __main__ : Step: 48800 +2025-05-12T12:38:09 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T12:38:09 | INFO | __main__ : Batch-wise Cosine Similarity | 89.52% +2025-05-12T12:38:09 | INFO | __main__ : Cosine Embedding Loss | 0.1048 +2025-05-12T12:38:09 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:38:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:38:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:33 | INFO | __main__ : Step: 48900 +2025-05-12T12:38:33 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T12:38:33 | INFO | __main__ : Batch-wise Cosine Similarity | 91.89% +2025-05-12T12:38:33 | INFO | __main__ : Cosine Embedding Loss | 0.0811 +2025-05-12T12:38:33 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:38:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:38:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:38 | INFO | utils.basic_utils : Train Epoch: [0] [ 202/4978] eta: 3 days, 7:22:45 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0746 eval_avg_sim: 0.6847 video-cosine_similarity: 0.9254 time: 59.3406 data: 0.0009 max mem: 11173 res mem: 15204 
+2025-05-12T12:38:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:57 | INFO | __main__ : Step: 49000 +2025-05-12T12:38:57 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T12:38:57 | INFO | __main__ : Batch-wise Cosine Similarity | 85.19% +2025-05-12T12:38:57 | INFO | __main__ : Cosine Embedding Loss | 0.1481 +2025-05-12T12:38:57 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:38:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:38:57 | INFO | __main__ : Evaluation Average Sim | 0.6847 +2025-05-12T12:38:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:38:57 | INFO | __main__ : Performing periodic evaluation at global step 49000... +2025-05-12T12:38:57 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:38:57 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:38:57 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:38:57 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:39:06 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5557 +2025-05-12T12:39:06 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0049000.png +2025-05-12T12:39:06 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:39:06 | INFO | __main__ : Evaluation at step 49000 complete. 
Average Similarity: 0.5557 +2025-05-12T12:39:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:39:30 | INFO | __main__ : Step: 49100 +2025-05-12T12:39:30 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T12:39:30 | INFO | __main__ : Batch-wise Cosine Similarity | 92.54% +2025-05-12T12:39:30 | INFO | __main__ : Cosine Embedding Loss | 0.0746 +2025-05-12T12:39:30 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:39:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:39:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:39:45 | INFO | utils.basic_utils : Train Epoch: [0] [ 203/4978] eta: 3 days, 7:24:27 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0724 eval_avg_sim: 0.5557 video-cosine_similarity: 0.9276 time: 59.8063 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:39:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:39:53 | INFO | __main__ : Step: 49200 +2025-05-12T12:39:53 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T12:39:53 | INFO | __main__ : Batch-wise Cosine Similarity | 78.46% +2025-05-12T12:39:53 | INFO | __main__ : Cosine Embedding Loss | 0.2154 +2025-05-12T12:39:53 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:39:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:39:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:40:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:40:17 | INFO | __main__ : Step: 49300 +2025-05-12T12:40:17 | INFO | __main__ : Current Frame Index within Batch Video: 142/247 +2025-05-12T12:40:17 | INFO | __main__ : Batch-wise Cosine Similarity | 89.34% +2025-05-12T12:40:17 | INFO | 
__main__ : Cosine Embedding Loss | 0.1066 +2025-05-12T12:40:17 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:40:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:40:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:40:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:40:41 | INFO | __main__ : Step: 49400 +2025-05-12T12:40:41 | INFO | __main__ : Current Frame Index within Batch Video: 242/247 +2025-05-12T12:40:41 | INFO | __main__ : Batch-wise Cosine Similarity | 91.57% +2025-05-12T12:40:41 | INFO | __main__ : Cosine Embedding Loss | 0.0843 +2025-05-12T12:40:41 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:40:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:40:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:40:42 | INFO | utils.basic_utils : Train Epoch: [0] [ 204/4978] eta: 3 days, 7:22:30 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0858 eval_avg_sim: 0.5557 video-cosine_similarity: 0.9142 time: 59.8072 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:41:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:41:05 | INFO | __main__ : Step: 49500 +2025-05-12T12:41:05 | INFO | __main__ : Current Frame Index within Batch Video: 101/247 +2025-05-12T12:41:05 | INFO | __main__ : Batch-wise Cosine Similarity | 86.55% +2025-05-12T12:41:05 | INFO | __main__ : Cosine Embedding Loss | 0.1345 +2025-05-12T12:41:05 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:41:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:41:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:41:29 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:41:29 | INFO | __main__ : Step: 49600 +2025-05-12T12:41:29 | INFO | __main__ : Current Frame Index within Batch Video: 201/247 +2025-05-12T12:41:29 | INFO | __main__ : Batch-wise Cosine Similarity | 91.51% +2025-05-12T12:41:29 | INFO | __main__ : Cosine Embedding Loss | 0.0849 +2025-05-12T12:41:29 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:41:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:41:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:41:40 | INFO | utils.basic_utils : Train Epoch: [0] [ 205/4978] eta: 3 days, 7:20:35 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0752 eval_avg_sim: 0.5557 video-cosine_similarity: 0.9248 time: 59.8082 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T12:41:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:41:53 | INFO | __main__ : Step: 49700 +2025-05-12T12:41:53 | INFO | __main__ : Current Frame Index within Batch Video: 60/247 +2025-05-12T12:41:53 | INFO | __main__ : Batch-wise Cosine Similarity | 81.48% +2025-05-12T12:41:53 | INFO | __main__ : Cosine Embedding Loss | 0.1852 +2025-05-12T12:41:53 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:41:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:41:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:42:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:42:16 | INFO | __main__ : Step: 49800 +2025-05-12T12:42:16 | INFO | __main__ : Current Frame Index within Batch Video: 160/247 +2025-05-12T12:42:16 | INFO | __main__ : Batch-wise Cosine Similarity | 89.90% +2025-05-12T12:42:16 | INFO | __main__ : Cosine Embedding Loss | 0.1010 +2025-05-12T12:42:16 | INFO 
| __main__ : Learning Rate | 0.000008 +2025-05-12T12:42:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:42:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:42:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 206/4978] eta: 3 days, 7:18:39 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0658 eval_avg_sim: 0.5557 video-cosine_similarity: 0.9342 time: 59.3369 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:42:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:42:40 | INFO | __main__ : Step: 49900 +2025-05-12T12:42:40 | INFO | __main__ : Current Frame Index within Batch Video: 19/247 +2025-05-12T12:42:40 | INFO | __main__ : Batch-wise Cosine Similarity | 70.06% +2025-05-12T12:42:40 | INFO | __main__ : Cosine Embedding Loss | 0.2994 +2025-05-12T12:42:40 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:42:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:42:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:43:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:43:04 | INFO | __main__ : Step: 50000 +2025-05-12T12:43:04 | INFO | __main__ : Current Frame Index within Batch Video: 119/247 +2025-05-12T12:43:04 | INFO | __main__ : Batch-wise Cosine Similarity | 89.44% +2025-05-12T12:43:04 | INFO | __main__ : Cosine Embedding Loss | 0.1056 +2025-05-12T12:43:04 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:43:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:43:04 | INFO | __main__ : Evaluation Average Sim | 0.5557 +2025-05-12T12:43:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:43:04 | INFO | __main__ : Saving checkpoint at global step 50000 +2025-05-12T12:43:05 | 
INFO | __main__ : Performing periodic evaluation at global step 50000... +2025-05-12T12:43:05 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:43:05 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:43:05 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:43:05 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:43:14 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5857 +2025-05-12T12:43:14 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0050000.png +2025-05-12T12:43:14 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:43:14 | INFO | __main__ : Evaluation at step 50000 complete. Average Similarity: 0.5857 +2025-05-12T12:43:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:43:37 | INFO | __main__ : Step: 50100 +2025-05-12T12:43:37 | INFO | __main__ : Current Frame Index within Batch Video: 219/247 +2025-05-12T12:43:37 | INFO | __main__ : Batch-wise Cosine Similarity | 89.95% +2025-05-12T12:43:37 | INFO | __main__ : Cosine Embedding Loss | 0.1005 +2025-05-12T12:43:37 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:43:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:43:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:43:44 | INFO | utils.basic_utils : Train Epoch: [0] [ 207/4978] eta: 3 days, 7:20:24 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0608 eval_avg_sim: 0.5857 video-cosine_similarity: 0.9392 time: 59.8155 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:44:01 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:44:01 | INFO | __main__ : Step: 50200 +2025-05-12T12:44:01 | INFO | __main__ : Current Frame Index within Batch Video: 78/247 +2025-05-12T12:44:01 | INFO | __main__ : Batch-wise Cosine Similarity | 83.38% +2025-05-12T12:44:01 | INFO | __main__ : Cosine Embedding Loss | 0.1662 +2025-05-12T12:44:01 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:44:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:44:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:44:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:44:25 | INFO | __main__ : Step: 50300 +2025-05-12T12:44:25 | INFO | __main__ : Current Frame Index within Batch Video: 178/247 +2025-05-12T12:44:25 | INFO | __main__ : Batch-wise Cosine Similarity | 90.84% +2025-05-12T12:44:25 | INFO | __main__ : Cosine Embedding Loss | 0.0916 +2025-05-12T12:44:25 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:44:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:44:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:44:42 | INFO | utils.basic_utils : Train Epoch: [0] [ 208/4978] eta: 3 days, 7:18:29 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0603 eval_avg_sim: 0.5857 video-cosine_similarity: 0.9397 time: 59.8181 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:44:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:44:49 | INFO | __main__ : Step: 50400 +2025-05-12T12:44:49 | INFO | __main__ : Current Frame Index within Batch Video: 37/247 +2025-05-12T12:44:49 | INFO | __main__ : Batch-wise Cosine Similarity | 76.71% +2025-05-12T12:44:49 | INFO | __main__ : Cosine Embedding Loss | 0.2329 +2025-05-12T12:44:49 | INFO 
| __main__ : Learning Rate | 0.000008 +2025-05-12T12:44:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:44:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:45:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:45:13 | INFO | __main__ : Step: 50500 +2025-05-12T12:45:13 | INFO | __main__ : Current Frame Index within Batch Video: 137/247 +2025-05-12T12:45:13 | INFO | __main__ : Batch-wise Cosine Similarity | 89.45% +2025-05-12T12:45:13 | INFO | __main__ : Cosine Embedding Loss | 0.1055 +2025-05-12T12:45:13 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:45:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:45:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:45:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:45:37 | INFO | __main__ : Step: 50600 +2025-05-12T12:45:37 | INFO | __main__ : Current Frame Index within Batch Video: 237/247 +2025-05-12T12:45:37 | INFO | __main__ : Batch-wise Cosine Similarity | 93.85% +2025-05-12T12:45:37 | INFO | __main__ : Cosine Embedding Loss | 0.0615 +2025-05-12T12:45:37 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:45:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:45:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:45:39 | INFO | utils.basic_utils : Train Epoch: [0] [ 209/4978] eta: 3 days, 7:16:34 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0554 eval_avg_sim: 0.5857 video-cosine_similarity: 0.9446 time: 59.8184 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:46:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:46:01 | INFO | __main__ : Step: 50700 
+2025-05-12T12:46:01 | INFO | __main__ : Current Frame Index within Batch Video: 96/247 +2025-05-12T12:46:01 | INFO | __main__ : Batch-wise Cosine Similarity | 87.66% +2025-05-12T12:46:01 | INFO | __main__ : Cosine Embedding Loss | 0.1234 +2025-05-12T12:46:01 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:46:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:46:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:46:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:46:24 | INFO | __main__ : Step: 50800 +2025-05-12T12:46:24 | INFO | __main__ : Current Frame Index within Batch Video: 196/247 +2025-05-12T12:46:24 | INFO | __main__ : Batch-wise Cosine Similarity | 92.39% +2025-05-12T12:46:24 | INFO | __main__ : Cosine Embedding Loss | 0.0761 +2025-05-12T12:46:24 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:46:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:46:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:46:36 | INFO | utils.basic_utils : Train Epoch: [0] [ 210/4978] eta: 3 days, 7:14:39 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0735 eval_avg_sim: 0.5857 video-cosine_similarity: 0.9265 time: 59.3509 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:46:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:46:48 | INFO | __main__ : Step: 50900 +2025-05-12T12:46:48 | INFO | __main__ : Current Frame Index within Batch Video: 55/247 +2025-05-12T12:46:48 | INFO | __main__ : Batch-wise Cosine Similarity | 81.79% +2025-05-12T12:46:48 | INFO | __main__ : Cosine Embedding Loss | 0.1821 +2025-05-12T12:46:48 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:46:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:46:48 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:47:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:47:12 | INFO | __main__ : Step: 51000 +2025-05-12T12:47:12 | INFO | __main__ : Current Frame Index within Batch Video: 155/247 +2025-05-12T12:47:12 | INFO | __main__ : Batch-wise Cosine Similarity | 90.52% +2025-05-12T12:47:12 | INFO | __main__ : Cosine Embedding Loss | 0.0948 +2025-05-12T12:47:12 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:47:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:47:12 | INFO | __main__ : Evaluation Average Sim | 0.5857 +2025-05-12T12:47:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:47:12 | INFO | __main__ : Performing periodic evaluation at global step 51000... +2025-05-12T12:47:12 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:47:12 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:47:12 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:47:12 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:47:22 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6695 +2025-05-12T12:47:22 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0051000.png +2025-05-12T12:47:22 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:47:22 | INFO | __main__ : Evaluation at step 51000 complete. 
Average Similarity: 0.6695 +2025-05-12T12:47:43 | INFO | utils.basic_utils : Train Epoch: [0] [ 211/4978] eta: 3 days, 7:16:16 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0533 eval_avg_sim: 0.6695 video-cosine_similarity: 0.9467 time: 59.8228 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:47:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:47:45 | INFO | __main__ : Step: 51100 +2025-05-12T12:47:45 | INFO | __main__ : Current Frame Index within Batch Video: 14/247 +2025-05-12T12:47:45 | INFO | __main__ : Batch-wise Cosine Similarity | 62.85% +2025-05-12T12:47:45 | INFO | __main__ : Cosine Embedding Loss | 0.3715 +2025-05-12T12:47:45 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:47:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:47:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:48:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:48:09 | INFO | __main__ : Step: 51200 +2025-05-12T12:48:09 | INFO | __main__ : Current Frame Index within Batch Video: 114/247 +2025-05-12T12:48:09 | INFO | __main__ : Batch-wise Cosine Similarity | 89.36% +2025-05-12T12:48:09 | INFO | __main__ : Cosine Embedding Loss | 0.1064 +2025-05-12T12:48:09 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:48:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:48:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:48:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:48:33 | INFO | __main__ : Step: 51300 +2025-05-12T12:48:33 | INFO | __main__ : Current Frame Index within Batch Video: 214/247 +2025-05-12T12:48:33 | INFO | __main__ : Batch-wise Cosine Similarity | 93.78% +2025-05-12T12:48:33 | INFO | 
__main__ : Cosine Embedding Loss | 0.0622 +2025-05-12T12:48:33 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:48:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:48:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:48:41 | INFO | utils.basic_utils : Train Epoch: [0] [ 212/4978] eta: 3 days, 7:14:21 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0499 eval_avg_sim: 0.6695 video-cosine_similarity: 0.9501 time: 59.8228 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:48:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:48:57 | INFO | __main__ : Step: 51400 +2025-05-12T12:48:57 | INFO | __main__ : Current Frame Index within Batch Video: 73/247 +2025-05-12T12:48:57 | INFO | __main__ : Batch-wise Cosine Similarity | 85.06% +2025-05-12T12:48:57 | INFO | __main__ : Cosine Embedding Loss | 0.1494 +2025-05-12T12:48:57 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:48:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:48:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:49:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:49:20 | INFO | __main__ : Step: 51500 +2025-05-12T12:49:20 | INFO | __main__ : Current Frame Index within Batch Video: 173/247 +2025-05-12T12:49:20 | INFO | __main__ : Batch-wise Cosine Similarity | 90.90% +2025-05-12T12:49:20 | INFO | __main__ : Cosine Embedding Loss | 0.0910 +2025-05-12T12:49:20 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:49:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:49:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:49:38 | INFO | utils.basic_utils : Train Epoch: [0] [ 213/4978] eta: 3 days, 7:12:28 lr: 
0.000008 temperature: 0.0126 video-loss_cosine: 0.0798 eval_avg_sim: 0.6695 video-cosine_similarity: 0.9202 time: 59.8228 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T12:49:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:49:44 | INFO | __main__ : Step: 51600 +2025-05-12T12:49:44 | INFO | __main__ : Current Frame Index within Batch Video: 32/247 +2025-05-12T12:49:44 | INFO | __main__ : Batch-wise Cosine Similarity | 76.65% +2025-05-12T12:49:44 | INFO | __main__ : Cosine Embedding Loss | 0.2335 +2025-05-12T12:49:44 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:49:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:49:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:50:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:50:08 | INFO | __main__ : Step: 51700 +2025-05-12T12:50:08 | INFO | __main__ : Current Frame Index within Batch Video: 132/247 +2025-05-12T12:50:08 | INFO | __main__ : Batch-wise Cosine Similarity | 90.30% +2025-05-12T12:50:08 | INFO | __main__ : Cosine Embedding Loss | 0.0970 +2025-05-12T12:50:08 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:50:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:50:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:50:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:50:32 | INFO | __main__ : Step: 51800 +2025-05-12T12:50:32 | INFO | __main__ : Current Frame Index within Batch Video: 232/247 +2025-05-12T12:50:32 | INFO | __main__ : Batch-wise Cosine Similarity | 91.81% +2025-05-12T12:50:32 | INFO | __main__ : Cosine Embedding Loss | 0.0819 +2025-05-12T12:50:32 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:50:32 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:50:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:50:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 214/4978] eta: 3 days, 7:10:35 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0759 eval_avg_sim: 0.6695 video-cosine_similarity: 0.9241 time: 59.8227 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T12:50:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:50:56 | INFO | __main__ : Step: 51900 +2025-05-12T12:50:56 | INFO | __main__ : Current Frame Index within Batch Video: 91/247 +2025-05-12T12:50:56 | INFO | __main__ : Batch-wise Cosine Similarity | 87.94% +2025-05-12T12:50:56 | INFO | __main__ : Cosine Embedding Loss | 0.1206 +2025-05-12T12:50:56 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:50:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:50:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:51:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:51:20 | INFO | __main__ : Step: 52000 +2025-05-12T12:51:20 | INFO | __main__ : Current Frame Index within Batch Video: 191/247 +2025-05-12T12:51:20 | INFO | __main__ : Batch-wise Cosine Similarity | 91.87% +2025-05-12T12:51:20 | INFO | __main__ : Cosine Embedding Loss | 0.0813 +2025-05-12T12:51:20 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:51:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:51:20 | INFO | __main__ : Evaluation Average Sim | 0.6695 +2025-05-12T12:51:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:51:20 | INFO | __main__ : Performing periodic evaluation at global step 52000... 
+2025-05-12T12:51:20 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:51:20 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:51:20 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:51:20 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:51:29 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6347 +2025-05-12T12:51:29 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0052000.png +2025-05-12T12:51:29 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:51:29 | INFO | __main__ : Evaluation at step 52000 complete. Average Similarity: 0.6347 +2025-05-12T12:51:42 | INFO | utils.basic_utils : Train Epoch: [0] [ 215/4978] eta: 3 days, 7:12:08 lr: 0.000008 temperature: 0.0126 video-loss_cosine: 0.0741 eval_avg_sim: 0.6347 video-cosine_similarity: 0.9259 time: 59.7985 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T12:51:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:51:53 | INFO | __main__ : Step: 52100 +2025-05-12T12:51:53 | INFO | __main__ : Current Frame Index within Batch Video: 50/247 +2025-05-12T12:51:53 | INFO | __main__ : Batch-wise Cosine Similarity | 81.24% +2025-05-12T12:51:53 | INFO | __main__ : Cosine Embedding Loss | 0.1876 +2025-05-12T12:51:53 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:51:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:51:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:52:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:52:17 | INFO | __main__ : 
Step: 52200 +2025-05-12T12:52:17 | INFO | __main__ : Current Frame Index within Batch Video: 150/247 +2025-05-12T12:52:17 | INFO | __main__ : Batch-wise Cosine Similarity | 92.41% +2025-05-12T12:52:17 | INFO | __main__ : Cosine Embedding Loss | 0.0759 +2025-05-12T12:52:17 | INFO | __main__ : Learning Rate | 0.000008 +2025-05-12T12:52:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:52:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:52:40 | INFO | utils.basic_utils : Train Epoch: [0] [ 216/4978] eta: 3 days, 7:10:12 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0701 eval_avg_sim: 0.6347 video-cosine_similarity: 0.9299 time: 59.7930 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T12:52:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:52:40 | INFO | __main__ : Step: 52300 +2025-05-12T12:52:40 | INFO | __main__ : Current Frame Index within Batch Video: 9/247 +2025-05-12T12:52:40 | INFO | __main__ : Batch-wise Cosine Similarity | 57.20% +2025-05-12T12:52:40 | INFO | __main__ : Cosine Embedding Loss | 0.4280 +2025-05-12T12:52:40 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:52:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:52:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:53:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:53:04 | INFO | __main__ : Step: 52400 +2025-05-12T12:53:04 | INFO | __main__ : Current Frame Index within Batch Video: 109/247 +2025-05-12T12:53:04 | INFO | __main__ : Batch-wise Cosine Similarity | 90.26% +2025-05-12T12:53:04 | INFO | __main__ : Cosine Embedding Loss | 0.0974 +2025-05-12T12:53:04 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:53:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:53:04 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:53:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:53:28 | INFO | __main__ : Step: 52500 +2025-05-12T12:53:28 | INFO | __main__ : Current Frame Index within Batch Video: 209/247 +2025-05-12T12:53:28 | INFO | __main__ : Batch-wise Cosine Similarity | 94.51% +2025-05-12T12:53:28 | INFO | __main__ : Cosine Embedding Loss | 0.0549 +2025-05-12T12:53:28 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:53:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:53:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:53:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 217/4978] eta: 3 days, 7:08:20 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0636 eval_avg_sim: 0.6347 video-cosine_similarity: 0.9364 time: 59.7887 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:53:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:53:52 | INFO | __main__ : Step: 52600 +2025-05-12T12:53:52 | INFO | __main__ : Current Frame Index within Batch Video: 68/247 +2025-05-12T12:53:52 | INFO | __main__ : Batch-wise Cosine Similarity | 84.29% +2025-05-12T12:53:52 | INFO | __main__ : Cosine Embedding Loss | 0.1571 +2025-05-12T12:53:52 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:53:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:53:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:54:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:54:16 | INFO | __main__ : Step: 52700 +2025-05-12T12:54:16 | INFO | __main__ : Current Frame Index within Batch Video: 168/247 +2025-05-12T12:54:16 | INFO | __main__ 
: Batch-wise Cosine Similarity | 93.08% +2025-05-12T12:54:16 | INFO | __main__ : Cosine Embedding Loss | 0.0692 +2025-05-12T12:54:16 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:54:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:54:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:54:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 218/4978] eta: 3 days, 7:06:27 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0478 eval_avg_sim: 0.6347 video-cosine_similarity: 0.9522 time: 59.7878 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:54:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:54:40 | INFO | __main__ : Step: 52800 +2025-05-12T12:54:40 | INFO | __main__ : Current Frame Index within Batch Video: 27/247 +2025-05-12T12:54:40 | INFO | __main__ : Batch-wise Cosine Similarity | 74.47% +2025-05-12T12:54:40 | INFO | __main__ : Cosine Embedding Loss | 0.2553 +2025-05-12T12:54:40 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:54:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:54:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:55:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:55:03 | INFO | __main__ : Step: 52900 +2025-05-12T12:55:03 | INFO | __main__ : Current Frame Index within Batch Video: 127/247 +2025-05-12T12:55:03 | INFO | __main__ : Batch-wise Cosine Similarity | 91.60% +2025-05-12T12:55:03 | INFO | __main__ : Cosine Embedding Loss | 0.0840 +2025-05-12T12:55:03 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:55:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:55:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:55:27 | INFO | __main__ 
: ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:55:27 | INFO | __main__ : Step: 53000 +2025-05-12T12:55:27 | INFO | __main__ : Current Frame Index within Batch Video: 227/247 +2025-05-12T12:55:27 | INFO | __main__ : Batch-wise Cosine Similarity | 90.29% +2025-05-12T12:55:27 | INFO | __main__ : Cosine Embedding Loss | 0.0971 +2025-05-12T12:55:27 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:55:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:55:27 | INFO | __main__ : Evaluation Average Sim | 0.6347 +2025-05-12T12:55:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:55:28 | INFO | __main__ : Performing periodic evaluation at global step 53000... +2025-05-12T12:55:28 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:55:28 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:55:28 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:55:28 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:55:37 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4938 +2025-05-12T12:55:37 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0053000.png +2025-05-12T12:55:37 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:55:37 | INFO | __main__ : Evaluation at step 53000 complete. 
Average Similarity: 0.4938 +2025-05-12T12:55:41 | INFO | utils.basic_utils : Train Epoch: [0] [ 219/4978] eta: 3 days, 7:07:59 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.1055 eval_avg_sim: 0.4938 video-cosine_similarity: 0.8945 time: 59.7766 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:56:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:56:00 | INFO | __main__ : Step: 53100 +2025-05-12T12:56:00 | INFO | __main__ : Current Frame Index within Batch Video: 86/247 +2025-05-12T12:56:00 | INFO | __main__ : Batch-wise Cosine Similarity | 85.72% +2025-05-12T12:56:00 | INFO | __main__ : Cosine Embedding Loss | 0.1428 +2025-05-12T12:56:00 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:56:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:56:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:56:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:56:24 | INFO | __main__ : Step: 53200 +2025-05-12T12:56:24 | INFO | __main__ : Current Frame Index within Batch Video: 186/247 +2025-05-12T12:56:24 | INFO | __main__ : Batch-wise Cosine Similarity | 94.03% +2025-05-12T12:56:24 | INFO | __main__ : Cosine Embedding Loss | 0.0597 +2025-05-12T12:56:24 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:56:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:56:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:56:39 | INFO | utils.basic_utils : Train Epoch: [0] [ 220/4978] eta: 3 days, 7:06:05 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0726 eval_avg_sim: 0.4938 video-cosine_similarity: 0.9274 time: 59.7734 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:56:48 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:56:48 | INFO | __main__ : Step: 53300 +2025-05-12T12:56:48 | INFO | __main__ : Current Frame Index within Batch Video: 45/247 +2025-05-12T12:56:48 | INFO | __main__ : Batch-wise Cosine Similarity | 77.58% +2025-05-12T12:56:48 | INFO | __main__ : Cosine Embedding Loss | 0.2242 +2025-05-12T12:56:48 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:56:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:56:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:57:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:57:12 | INFO | __main__ : Step: 53400 +2025-05-12T12:57:12 | INFO | __main__ : Current Frame Index within Batch Video: 145/247 +2025-05-12T12:57:12 | INFO | __main__ : Batch-wise Cosine Similarity | 89.85% +2025-05-12T12:57:12 | INFO | __main__ : Cosine Embedding Loss | 0.1015 +2025-05-12T12:57:12 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:57:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:57:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:57:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:57:36 | INFO | __main__ : Step: 53500 +2025-05-12T12:57:36 | INFO | __main__ : Current Frame Index within Batch Video: 245/247 +2025-05-12T12:57:36 | INFO | __main__ : Batch-wise Cosine Similarity | 92.73% +2025-05-12T12:57:36 | INFO | __main__ : Cosine Embedding Loss | 0.0727 +2025-05-12T12:57:36 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:57:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:57:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:57:36 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 221/4978] eta: 3 days, 7:04:14 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0729 eval_avg_sim: 0.4938 video-cosine_similarity: 0.9271 time: 59.7732 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:58:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:58:00 | INFO | __main__ : Step: 53600 +2025-05-12T12:58:00 | INFO | __main__ : Current Frame Index within Batch Video: 104/247 +2025-05-12T12:58:00 | INFO | __main__ : Batch-wise Cosine Similarity | 88.29% +2025-05-12T12:58:00 | INFO | __main__ : Cosine Embedding Loss | 0.1171 +2025-05-12T12:58:00 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:58:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:58:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:58:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:58:23 | INFO | __main__ : Step: 53700 +2025-05-12T12:58:23 | INFO | __main__ : Current Frame Index within Batch Video: 204/247 +2025-05-12T12:58:23 | INFO | __main__ : Batch-wise Cosine Similarity | 93.33% +2025-05-12T12:58:23 | INFO | __main__ : Cosine Embedding Loss | 0.0667 +2025-05-12T12:58:23 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:58:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:58:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:58:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 222/4978] eta: 3 days, 7:02:23 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0599 eval_avg_sim: 0.4938 video-cosine_similarity: 0.9401 time: 59.7742 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:58:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:58:47 | INFO | __main__ 
: Step: 53800 +2025-05-12T12:58:47 | INFO | __main__ : Current Frame Index within Batch Video: 63/247 +2025-05-12T12:58:47 | INFO | __main__ : Batch-wise Cosine Similarity | 83.89% +2025-05-12T12:58:47 | INFO | __main__ : Cosine Embedding Loss | 0.1611 +2025-05-12T12:58:47 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:58:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:58:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:59:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:59:11 | INFO | __main__ : Step: 53900 +2025-05-12T12:59:11 | INFO | __main__ : Current Frame Index within Batch Video: 163/247 +2025-05-12T12:59:11 | INFO | __main__ : Batch-wise Cosine Similarity | 90.88% +2025-05-12T12:59:11 | INFO | __main__ : Cosine Embedding Loss | 0.0912 +2025-05-12T12:59:11 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:59:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:59:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:59:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 223/4978] eta: 3 days, 7:00:32 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0539 eval_avg_sim: 0.4938 video-cosine_similarity: 0.9461 time: 59.3096 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T12:59:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:59:35 | INFO | __main__ : Step: 54000 +2025-05-12T12:59:35 | INFO | __main__ : Current Frame Index within Batch Video: 22/247 +2025-05-12T12:59:35 | INFO | __main__ : Batch-wise Cosine Similarity | 71.13% +2025-05-12T12:59:35 | INFO | __main__ : Cosine Embedding Loss | 0.2887 +2025-05-12T12:59:35 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T12:59:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T12:59:35 
| INFO | __main__ : Evaluation Average Sim | 0.4938 +2025-05-12T12:59:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T12:59:35 | INFO | __main__ : Performing periodic evaluation at global step 54000... +2025-05-12T12:59:35 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T12:59:35 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T12:59:35 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T12:59:35 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T12:59:45 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5660 +2025-05-12T12:59:45 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0054000.png +2025-05-12T12:59:45 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T12:59:45 | INFO | __main__ : Evaluation at step 54000 complete. 
Average Similarity: 0.5660 +2025-05-12T13:00:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:00:09 | INFO | __main__ : Step: 54100 +2025-05-12T13:00:09 | INFO | __main__ : Current Frame Index within Batch Video: 122/247 +2025-05-12T13:00:09 | INFO | __main__ : Batch-wise Cosine Similarity | 91.24% +2025-05-12T13:00:09 | INFO | __main__ : Cosine Embedding Loss | 0.0876 +2025-05-12T13:00:09 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:00:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:00:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:00:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:00:32 | INFO | __main__ : Step: 54200 +2025-05-12T13:00:32 | INFO | __main__ : Current Frame Index within Batch Video: 222/247 +2025-05-12T13:00:32 | INFO | __main__ : Batch-wise Cosine Similarity | 94.22% +2025-05-12T13:00:32 | INFO | __main__ : Cosine Embedding Loss | 0.0578 +2025-05-12T13:00:32 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:00:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:00:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:00:38 | INFO | utils.basic_utils : Train Epoch: [0] [ 224/4978] eta: 3 days, 7:02:12 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0522 eval_avg_sim: 0.5660 video-cosine_similarity: 0.9478 time: 59.8068 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:00:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:00:56 | INFO | __main__ : Step: 54300 +2025-05-12T13:00:56 | INFO | __main__ : Current Frame Index within Batch Video: 81/247 +2025-05-12T13:00:56 | INFO | __main__ : Batch-wise Cosine Similarity | 88.08% +2025-05-12T13:00:56 | INFO | 
__main__ : Cosine Embedding Loss | 0.1192 +2025-05-12T13:00:56 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:00:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:00:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:01:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:01:20 | INFO | __main__ : Step: 54400 +2025-05-12T13:01:20 | INFO | __main__ : Current Frame Index within Batch Video: 181/247 +2025-05-12T13:01:20 | INFO | __main__ : Batch-wise Cosine Similarity | 94.31% +2025-05-12T13:01:20 | INFO | __main__ : Cosine Embedding Loss | 0.0569 +2025-05-12T13:01:20 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:01:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:01:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:01:36 | INFO | utils.basic_utils : Train Epoch: [0] [ 225/4978] eta: 3 days, 7:00:21 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0454 eval_avg_sim: 0.5660 video-cosine_similarity: 0.9546 time: 59.8050 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:01:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:01:44 | INFO | __main__ : Step: 54500 +2025-05-12T13:01:44 | INFO | __main__ : Current Frame Index within Batch Video: 40/247 +2025-05-12T13:01:44 | INFO | __main__ : Batch-wise Cosine Similarity | 77.49% +2025-05-12T13:01:44 | INFO | __main__ : Cosine Embedding Loss | 0.2251 +2025-05-12T13:01:44 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:01:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:01:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:02:08 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:02:08 | INFO | __main__ : Step: 54600 +2025-05-12T13:02:08 | INFO | __main__ : Current Frame Index within Batch Video: 140/247 +2025-05-12T13:02:08 | INFO | __main__ : Batch-wise Cosine Similarity | 90.81% +2025-05-12T13:02:08 | INFO | __main__ : Cosine Embedding Loss | 0.0919 +2025-05-12T13:02:08 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:02:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:02:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:02:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:02:32 | INFO | __main__ : Step: 54700 +2025-05-12T13:02:32 | INFO | __main__ : Current Frame Index within Batch Video: 240/247 +2025-05-12T13:02:32 | INFO | __main__ : Batch-wise Cosine Similarity | 94.03% +2025-05-12T13:02:32 | INFO | __main__ : Cosine Embedding Loss | 0.0597 +2025-05-12T13:02:32 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:02:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:02:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:02:33 | INFO | utils.basic_utils : Train Epoch: [0] [ 226/4978] eta: 3 days, 6:58:32 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0531 eval_avg_sim: 0.5660 video-cosine_similarity: 0.9469 time: 59.8057 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:02:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:02:56 | INFO | __main__ : Step: 54800 +2025-05-12T13:02:56 | INFO | __main__ : Current Frame Index within Batch Video: 99/247 +2025-05-12T13:02:56 | INFO | __main__ : Batch-wise Cosine Similarity | 87.62% +2025-05-12T13:02:56 | INFO | __main__ : Cosine Embedding Loss | 0.1238 +2025-05-12T13:02:56 | INFO 
| __main__ : Learning Rate | 0.000009 +2025-05-12T13:02:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:02:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:03:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:03:19 | INFO | __main__ : Step: 54900 +2025-05-12T13:03:19 | INFO | __main__ : Current Frame Index within Batch Video: 199/247 +2025-05-12T13:03:19 | INFO | __main__ : Batch-wise Cosine Similarity | 91.45% +2025-05-12T13:03:19 | INFO | __main__ : Cosine Embedding Loss | 0.0855 +2025-05-12T13:03:19 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:03:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:03:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:03:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 227/4978] eta: 3 days, 6:56:42 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0755 eval_avg_sim: 0.5660 video-cosine_similarity: 0.9245 time: 59.3290 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:03:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:03:43 | INFO | __main__ : Step: 55000 +2025-05-12T13:03:43 | INFO | __main__ : Current Frame Index within Batch Video: 58/247 +2025-05-12T13:03:43 | INFO | __main__ : Batch-wise Cosine Similarity | 81.61% +2025-05-12T13:03:43 | INFO | __main__ : Cosine Embedding Loss | 0.1839 +2025-05-12T13:03:43 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:03:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:03:43 | INFO | __main__ : Evaluation Average Sim | 0.5660 +2025-05-12T13:03:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:03:43 | INFO | __main__ : Saving checkpoint at global step 55000 +2025-05-12T13:03:44 | 
INFO | __main__ : Performing periodic evaluation at global step 55000... +2025-05-12T13:03:44 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:03:44 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:03:44 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:03:44 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:03:53 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5796 +2025-05-12T13:03:53 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0055000.png +2025-05-12T13:03:53 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:03:53 | INFO | __main__ : Evaluation at step 55000 complete. Average Similarity: 0.5796 +2025-05-12T13:04:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:04:16 | INFO | __main__ : Step: 55100 +2025-05-12T13:04:16 | INFO | __main__ : Current Frame Index within Batch Video: 158/247 +2025-05-12T13:04:16 | INFO | __main__ : Batch-wise Cosine Similarity | 90.06% +2025-05-12T13:04:16 | INFO | __main__ : Cosine Embedding Loss | 0.0994 +2025-05-12T13:04:16 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:04:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:04:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:04:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 228/4978] eta: 3 days, 6:58:06 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0652 eval_avg_sim: 0.5796 video-cosine_similarity: 0.9348 time: 59.7933 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:04:40 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:04:40 | INFO | __main__ : Step: 55200 +2025-05-12T13:04:40 | INFO | __main__ : Current Frame Index within Batch Video: 17/247 +2025-05-12T13:04:40 | INFO | __main__ : Batch-wise Cosine Similarity | 66.53% +2025-05-12T13:04:40 | INFO | __main__ : Cosine Embedding Loss | 0.3347 +2025-05-12T13:04:40 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:04:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:04:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:05:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:05:04 | INFO | __main__ : Step: 55300 +2025-05-12T13:05:04 | INFO | __main__ : Current Frame Index within Batch Video: 117/247 +2025-05-12T13:05:04 | INFO | __main__ : Batch-wise Cosine Similarity | 88.95% +2025-05-12T13:05:04 | INFO | __main__ : Cosine Embedding Loss | 0.1105 +2025-05-12T13:05:04 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:05:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:05:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:05:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:05:28 | INFO | __main__ : Step: 55400 +2025-05-12T13:05:28 | INFO | __main__ : Current Frame Index within Batch Video: 217/247 +2025-05-12T13:05:28 | INFO | __main__ : Batch-wise Cosine Similarity | 91.49% +2025-05-12T13:05:28 | INFO | __main__ : Cosine Embedding Loss | 0.0851 +2025-05-12T13:05:28 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:05:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:05:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:05:35 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 229/4978] eta: 3 days, 6:56:17 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0637 eval_avg_sim: 0.5796 video-cosine_similarity: 0.9363 time: 59.7935 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:05:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:05:52 | INFO | __main__ : Step: 55500 +2025-05-12T13:05:52 | INFO | __main__ : Current Frame Index within Batch Video: 76/247 +2025-05-12T13:05:52 | INFO | __main__ : Batch-wise Cosine Similarity | 84.46% +2025-05-12T13:05:52 | INFO | __main__ : Cosine Embedding Loss | 0.1554 +2025-05-12T13:05:52 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:05:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:05:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:06:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:06:15 | INFO | __main__ : Step: 55600 +2025-05-12T13:06:15 | INFO | __main__ : Current Frame Index within Batch Video: 176/247 +2025-05-12T13:06:15 | INFO | __main__ : Batch-wise Cosine Similarity | 88.10% +2025-05-12T13:06:15 | INFO | __main__ : Cosine Embedding Loss | 0.1190 +2025-05-12T13:06:15 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:06:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:06:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:06:32 | INFO | utils.basic_utils : Train Epoch: [0] [ 230/4978] eta: 3 days, 6:54:27 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.1005 eval_avg_sim: 0.5796 video-cosine_similarity: 0.8995 time: 59.7962 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:06:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:06:39 | INFO | __main__ : 
Step: 55700 +2025-05-12T13:06:39 | INFO | __main__ : Current Frame Index within Batch Video: 35/247 +2025-05-12T13:06:39 | INFO | __main__ : Batch-wise Cosine Similarity | 77.87% +2025-05-12T13:06:39 | INFO | __main__ : Cosine Embedding Loss | 0.2213 +2025-05-12T13:06:39 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:06:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:06:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:03 | INFO | __main__ : Step: 55800 +2025-05-12T13:07:03 | INFO | __main__ : Current Frame Index within Batch Video: 135/247 +2025-05-12T13:07:03 | INFO | __main__ : Batch-wise Cosine Similarity | 90.20% +2025-05-12T13:07:03 | INFO | __main__ : Cosine Embedding Loss | 0.0980 +2025-05-12T13:07:03 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:07:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:07:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:27 | INFO | __main__ : Step: 55900 +2025-05-12T13:07:27 | INFO | __main__ : Current Frame Index within Batch Video: 235/247 +2025-05-12T13:07:27 | INFO | __main__ : Batch-wise Cosine Similarity | 93.78% +2025-05-12T13:07:27 | INFO | __main__ : Cosine Embedding Loss | 0.0622 +2025-05-12T13:07:27 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:07:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:07:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 231/4978] eta: 3 days, 6:52:39 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0645 
eval_avg_sim: 0.5796 video-cosine_similarity: 0.9355 time: 59.3279 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:07:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:51 | INFO | __main__ : Step: 56000 +2025-05-12T13:07:51 | INFO | __main__ : Current Frame Index within Batch Video: 94/247 +2025-05-12T13:07:51 | INFO | __main__ : Batch-wise Cosine Similarity | 88.74% +2025-05-12T13:07:51 | INFO | __main__ : Cosine Embedding Loss | 0.1126 +2025-05-12T13:07:51 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:07:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:07:51 | INFO | __main__ : Evaluation Average Sim | 0.5796 +2025-05-12T13:07:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:07:51 | INFO | __main__ : Performing periodic evaluation at global step 56000... +2025-05-12T13:07:51 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:07:51 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:07:51 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:07:51 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:08:00 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6372 +2025-05-12T13:08:00 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0056000.png +2025-05-12T13:08:00 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:08:00 | INFO | __main__ : Evaluation at step 56000 complete. 
Average Similarity: 0.6372 +2025-05-12T13:08:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:08:24 | INFO | __main__ : Step: 56100 +2025-05-12T13:08:24 | INFO | __main__ : Current Frame Index within Batch Video: 194/247 +2025-05-12T13:08:24 | INFO | __main__ : Batch-wise Cosine Similarity | 94.99% +2025-05-12T13:08:24 | INFO | __main__ : Cosine Embedding Loss | 0.0501 +2025-05-12T13:08:24 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:08:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:08:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:08:37 | INFO | utils.basic_utils : Train Epoch: [0] [ 232/4978] eta: 3 days, 6:54:00 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0564 eval_avg_sim: 0.6372 video-cosine_similarity: 0.9436 time: 59.7940 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:08:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:08:48 | INFO | __main__ : Step: 56200 +2025-05-12T13:08:48 | INFO | __main__ : Current Frame Index within Batch Video: 53/247 +2025-05-12T13:08:48 | INFO | __main__ : Batch-wise Cosine Similarity | 82.67% +2025-05-12T13:08:48 | INFO | __main__ : Cosine Embedding Loss | 0.1733 +2025-05-12T13:08:48 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:08:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:08:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:09:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:09:12 | INFO | __main__ : Step: 56300 +2025-05-12T13:09:12 | INFO | __main__ : Current Frame Index within Batch Video: 153/247 +2025-05-12T13:09:12 | INFO | __main__ : Batch-wise Cosine Similarity | 92.15% +2025-05-12T13:09:12 | INFO | 
__main__ : Cosine Embedding Loss | 0.0785 +2025-05-12T13:09:12 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:09:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:09:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:09:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 233/4978] eta: 3 days, 6:52:11 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0652 eval_avg_sim: 0.6372 video-cosine_similarity: 0.9348 time: 59.7921 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:09:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:09:35 | INFO | __main__ : Step: 56400 +2025-05-12T13:09:35 | INFO | __main__ : Current Frame Index within Batch Video: 12/247 +2025-05-12T13:09:35 | INFO | __main__ : Batch-wise Cosine Similarity | 59.42% +2025-05-12T13:09:35 | INFO | __main__ : Cosine Embedding Loss | 0.4058 +2025-05-12T13:09:35 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:09:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:09:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:09:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:09:59 | INFO | __main__ : Step: 56500 +2025-05-12T13:09:59 | INFO | __main__ : Current Frame Index within Batch Video: 112/247 +2025-05-12T13:09:59 | INFO | __main__ : Batch-wise Cosine Similarity | 88.94% +2025-05-12T13:09:59 | INFO | __main__ : Cosine Embedding Loss | 0.1106 +2025-05-12T13:09:59 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:09:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:09:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:10:23 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:10:23 | INFO | __main__ : Step: 56600 +2025-05-12T13:10:23 | INFO | __main__ : Current Frame Index within Batch Video: 212/247 +2025-05-12T13:10:23 | INFO | __main__ : Batch-wise Cosine Similarity | 92.68% +2025-05-12T13:10:23 | INFO | __main__ : Cosine Embedding Loss | 0.0732 +2025-05-12T13:10:23 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:10:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:10:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:10:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 234/4978] eta: 3 days, 6:50:22 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0457 eval_avg_sim: 0.6372 video-cosine_similarity: 0.9543 time: 59.7916 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:10:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:10:47 | INFO | __main__ : Step: 56700 +2025-05-12T13:10:47 | INFO | __main__ : Current Frame Index within Batch Video: 71/247 +2025-05-12T13:10:47 | INFO | __main__ : Batch-wise Cosine Similarity | 86.43% +2025-05-12T13:10:47 | INFO | __main__ : Cosine Embedding Loss | 0.1357 +2025-05-12T13:10:47 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:10:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:10:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:11 | INFO | __main__ : Step: 56800 +2025-05-12T13:11:11 | INFO | __main__ : Current Frame Index within Batch Video: 171/247 +2025-05-12T13:11:11 | INFO | __main__ : Batch-wise Cosine Similarity | 89.32% +2025-05-12T13:11:11 | INFO | __main__ : Cosine Embedding Loss | 0.1068 +2025-05-12T13:11:11 | INFO 
| __main__ : Learning Rate | 0.000009 +2025-05-12T13:11:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:11:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:29 | INFO | utils.basic_utils : Train Epoch: [0] [ 235/4978] eta: 3 days, 6:48:34 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0770 eval_avg_sim: 0.6372 video-cosine_similarity: 0.9230 time: 59.3243 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:11:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:35 | INFO | __main__ : Step: 56900 +2025-05-12T13:11:35 | INFO | __main__ : Current Frame Index within Batch Video: 30/247 +2025-05-12T13:11:35 | INFO | __main__ : Batch-wise Cosine Similarity | 76.47% +2025-05-12T13:11:35 | INFO | __main__ : Cosine Embedding Loss | 0.2353 +2025-05-12T13:11:35 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:11:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:11:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:58 | INFO | __main__ : Step: 57000 +2025-05-12T13:11:58 | INFO | __main__ : Current Frame Index within Batch Video: 130/247 +2025-05-12T13:11:58 | INFO | __main__ : Batch-wise Cosine Similarity | 91.67% +2025-05-12T13:11:58 | INFO | __main__ : Cosine Embedding Loss | 0.0833 +2025-05-12T13:11:58 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:11:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:11:58 | INFO | __main__ : Evaluation Average Sim | 0.6372 +2025-05-12T13:11:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:11:59 | INFO | __main__ : Performing periodic evaluation at global step 57000... 
+2025-05-12T13:11:59 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:11:59 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:11:59 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:11:59 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:12:08 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5562 +2025-05-12T13:12:08 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0057000.png +2025-05-12T13:12:08 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:12:08 | INFO | __main__ : Evaluation at step 57000 complete. Average Similarity: 0.5562 +2025-05-12T13:12:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:12:32 | INFO | __main__ : Step: 57100 +2025-05-12T13:12:32 | INFO | __main__ : Current Frame Index within Batch Video: 230/247 +2025-05-12T13:12:32 | INFO | __main__ : Batch-wise Cosine Similarity | 94.34% +2025-05-12T13:12:32 | INFO | __main__ : Cosine Embedding Loss | 0.0566 +2025-05-12T13:12:32 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:12:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:12:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:12:36 | INFO | utils.basic_utils : Train Epoch: [0] [ 236/4978] eta: 3 days, 6:49:53 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0640 eval_avg_sim: 0.5562 video-cosine_similarity: 0.9360 time: 59.7961 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:12:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:12:55 | INFO | __main__ : 
Step: 57200 +2025-05-12T13:12:55 | INFO | __main__ : Current Frame Index within Batch Video: 89/247 +2025-05-12T13:12:55 | INFO | __main__ : Batch-wise Cosine Similarity | 87.89% +2025-05-12T13:12:55 | INFO | __main__ : Cosine Embedding Loss | 0.1211 +2025-05-12T13:12:55 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:12:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:12:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:13:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:13:19 | INFO | __main__ : Step: 57300 +2025-05-12T13:13:19 | INFO | __main__ : Current Frame Index within Batch Video: 189/247 +2025-05-12T13:13:19 | INFO | __main__ : Batch-wise Cosine Similarity | 93.13% +2025-05-12T13:13:19 | INFO | __main__ : Cosine Embedding Loss | 0.0687 +2025-05-12T13:13:19 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:13:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:13:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:13:33 | INFO | utils.basic_utils : Train Epoch: [0] [ 237/4978] eta: 3 days, 6:48:05 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0417 eval_avg_sim: 0.5562 video-cosine_similarity: 0.9583 time: 59.7950 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:13:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:13:43 | INFO | __main__ : Step: 57400 +2025-05-12T13:13:43 | INFO | __main__ : Current Frame Index within Batch Video: 48/247 +2025-05-12T13:13:43 | INFO | __main__ : Batch-wise Cosine Similarity | 77.04% +2025-05-12T13:13:43 | INFO | __main__ : Cosine Embedding Loss | 0.2296 +2025-05-12T13:13:43 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:13:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:13:43 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:14:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:14:07 | INFO | __main__ : Step: 57500 +2025-05-12T13:14:07 | INFO | __main__ : Current Frame Index within Batch Video: 148/247 +2025-05-12T13:14:07 | INFO | __main__ : Batch-wise Cosine Similarity | 90.65% +2025-05-12T13:14:07 | INFO | __main__ : Cosine Embedding Loss | 0.0935 +2025-05-12T13:14:07 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:14:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:14:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:14:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 238/4978] eta: 3 days, 6:46:17 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0721 eval_avg_sim: 0.5562 video-cosine_similarity: 0.9279 time: 59.7963 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:14:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:14:31 | INFO | __main__ : Step: 57600 +2025-05-12T13:14:31 | INFO | __main__ : Current Frame Index within Batch Video: 7/247 +2025-05-12T13:14:31 | INFO | __main__ : Batch-wise Cosine Similarity | 58.35% +2025-05-12T13:14:31 | INFO | __main__ : Cosine Embedding Loss | 0.4165 +2025-05-12T13:14:31 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:14:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:14:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:14:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:14:55 | INFO | __main__ : Step: 57700 +2025-05-12T13:14:55 | INFO | __main__ : Current Frame Index within Batch Video: 107/247 +2025-05-12T13:14:55 | INFO | __main__ 
: Batch-wise Cosine Similarity | 90.66% +2025-05-12T13:14:55 | INFO | __main__ : Cosine Embedding Loss | 0.0934 +2025-05-12T13:14:55 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:14:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:14:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:15:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:15:18 | INFO | __main__ : Step: 57800 +2025-05-12T13:15:18 | INFO | __main__ : Current Frame Index within Batch Video: 207/247 +2025-05-12T13:15:18 | INFO | __main__ : Batch-wise Cosine Similarity | 93.51% +2025-05-12T13:15:18 | INFO | __main__ : Cosine Embedding Loss | 0.0649 +2025-05-12T13:15:18 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:15:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:15:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:15:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 239/4978] eta: 3 days, 6:44:30 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0530 eval_avg_sim: 0.5562 video-cosine_similarity: 0.9470 time: 59.3260 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:15:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:15:42 | INFO | __main__ : Step: 57900 +2025-05-12T13:15:42 | INFO | __main__ : Current Frame Index within Batch Video: 66/247 +2025-05-12T13:15:42 | INFO | __main__ : Batch-wise Cosine Similarity | 79.37% +2025-05-12T13:15:42 | INFO | __main__ : Cosine Embedding Loss | 0.2063 +2025-05-12T13:15:42 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:15:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:15:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:16:06 | INFO | __main__ 
: ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:16:06 | INFO | __main__ : Step: 58000 +2025-05-12T13:16:06 | INFO | __main__ : Current Frame Index within Batch Video: 166/247 +2025-05-12T13:16:06 | INFO | __main__ : Batch-wise Cosine Similarity | 91.15% +2025-05-12T13:16:06 | INFO | __main__ : Cosine Embedding Loss | 0.0885 +2025-05-12T13:16:06 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:16:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:16:06 | INFO | __main__ : Evaluation Average Sim | 0.5562 +2025-05-12T13:16:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:16:06 | INFO | __main__ : Performing periodic evaluation at global step 58000... +2025-05-12T13:16:06 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:16:06 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:16:06 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:16:06 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:16:16 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5830 +2025-05-12T13:16:16 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0058000.png +2025-05-12T13:16:16 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:16:16 | INFO | __main__ : Evaluation at step 58000 complete. 
Average Similarity: 0.5830 +2025-05-12T13:16:35 | INFO | utils.basic_utils : Train Epoch: [0] [ 240/4978] eta: 3 days, 6:45:46 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0843 eval_avg_sim: 0.5830 video-cosine_similarity: 0.9157 time: 59.7946 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:16:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:16:39 | INFO | __main__ : Step: 58100 +2025-05-12T13:16:39 | INFO | __main__ : Current Frame Index within Batch Video: 25/247 +2025-05-12T13:16:39 | INFO | __main__ : Batch-wise Cosine Similarity | 72.63% +2025-05-12T13:16:39 | INFO | __main__ : Cosine Embedding Loss | 0.2737 +2025-05-12T13:16:39 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:16:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:16:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:17:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:17:03 | INFO | __main__ : Step: 58200 +2025-05-12T13:17:03 | INFO | __main__ : Current Frame Index within Batch Video: 125/247 +2025-05-12T13:17:03 | INFO | __main__ : Batch-wise Cosine Similarity | 91.98% +2025-05-12T13:17:03 | INFO | __main__ : Cosine Embedding Loss | 0.0802 +2025-05-12T13:17:03 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:17:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:17:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:17:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:17:27 | INFO | __main__ : Step: 58300 +2025-05-12T13:17:27 | INFO | __main__ : Current Frame Index within Batch Video: 225/247 +2025-05-12T13:17:27 | INFO | __main__ : Batch-wise Cosine Similarity | 95.62% +2025-05-12T13:17:27 | INFO | 
__main__ : Cosine Embedding Loss | 0.0438 +2025-05-12T13:17:27 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:17:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:17:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:17:32 | INFO | utils.basic_utils : Train Epoch: [0] [ 241/4978] eta: 3 days, 6:44:00 lr: 0.000009 temperature: 0.0126 video-loss_cosine: 0.0388 eval_avg_sim: 0.5830 video-cosine_similarity: 0.9612 time: 59.7952 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:17:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:17:51 | INFO | __main__ : Step: 58400 +2025-05-12T13:17:51 | INFO | __main__ : Current Frame Index within Batch Video: 84/247 +2025-05-12T13:17:51 | INFO | __main__ : Batch-wise Cosine Similarity | 86.60% +2025-05-12T13:17:51 | INFO | __main__ : Cosine Embedding Loss | 0.1340 +2025-05-12T13:17:51 | INFO | __main__ : Learning Rate | 0.000009 +2025-05-12T13:17:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:17:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:18:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:18:15 | INFO | __main__ : Step: 58500 +2025-05-12T13:18:15 | INFO | __main__ : Current Frame Index within Batch Video: 184/247 +2025-05-12T13:18:15 | INFO | __main__ : Batch-wise Cosine Similarity | 93.12% +2025-05-12T13:18:15 | INFO | __main__ : Cosine Embedding Loss | 0.0688 +2025-05-12T13:18:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:18:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:18:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:18:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 242/4978] eta: 3 days, 6:42:14 lr: 
0.000010 temperature: 0.0126 video-loss_cosine: 0.0684 eval_avg_sim: 0.5830 video-cosine_similarity: 0.9316 time: 59.7962 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:18:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:18:38 | INFO | __main__ : Step: 58600 +2025-05-12T13:18:38 | INFO | __main__ : Current Frame Index within Batch Video: 43/247 +2025-05-12T13:18:38 | INFO | __main__ : Batch-wise Cosine Similarity | 78.86% +2025-05-12T13:18:38 | INFO | __main__ : Cosine Embedding Loss | 0.2114 +2025-05-12T13:18:38 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:18:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:18:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:19:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:19:02 | INFO | __main__ : Step: 58700 +2025-05-12T13:19:02 | INFO | __main__ : Current Frame Index within Batch Video: 143/247 +2025-05-12T13:19:02 | INFO | __main__ : Batch-wise Cosine Similarity | 91.53% +2025-05-12T13:19:02 | INFO | __main__ : Cosine Embedding Loss | 0.0847 +2025-05-12T13:19:02 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:19:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:19:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:19:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:19:26 | INFO | __main__ : Step: 58800 +2025-05-12T13:19:26 | INFO | __main__ : Current Frame Index within Batch Video: 243/247 +2025-05-12T13:19:26 | INFO | __main__ : Batch-wise Cosine Similarity | 93.70% +2025-05-12T13:19:26 | INFO | __main__ : Cosine Embedding Loss | 0.0630 +2025-05-12T13:19:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:19:26 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:19:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:19:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 243/4978] eta: 3 days, 6:40:28 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0625 eval_avg_sim: 0.5830 video-cosine_similarity: 0.9375 time: 59.7968 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:19:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:19:50 | INFO | __main__ : Step: 58900 +2025-05-12T13:19:50 | INFO | __main__ : Current Frame Index within Batch Video: 102/247 +2025-05-12T13:19:50 | INFO | __main__ : Batch-wise Cosine Similarity | 86.94% +2025-05-12T13:19:50 | INFO | __main__ : Cosine Embedding Loss | 0.1306 +2025-05-12T13:19:50 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:19:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:19:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:20:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:20:14 | INFO | __main__ : Step: 59000 +2025-05-12T13:20:14 | INFO | __main__ : Current Frame Index within Batch Video: 202/247 +2025-05-12T13:20:14 | INFO | __main__ : Batch-wise Cosine Similarity | 94.16% +2025-05-12T13:20:14 | INFO | __main__ : Cosine Embedding Loss | 0.0584 +2025-05-12T13:20:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:20:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:20:14 | INFO | __main__ : Evaluation Average Sim | 0.5830 +2025-05-12T13:20:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:20:14 | INFO | __main__ : Performing periodic evaluation at global step 59000... 
+2025-05-12T13:20:14 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:20:14 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:20:14 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:20:14 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:20:23 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6483 +2025-05-12T13:20:23 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0059000.png +2025-05-12T13:20:23 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:20:23 | INFO | __main__ : Evaluation at step 59000 complete. Average Similarity: 0.6483 +2025-05-12T13:20:34 | INFO | utils.basic_utils : Train Epoch: [0] [ 244/4978] eta: 3 days, 6:41:42 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0537 eval_avg_sim: 0.6483 video-cosine_similarity: 0.9463 time: 59.7638 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:20:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:20:47 | INFO | __main__ : Step: 59100 +2025-05-12T13:20:47 | INFO | __main__ : Current Frame Index within Batch Video: 61/247 +2025-05-12T13:20:47 | INFO | __main__ : Batch-wise Cosine Similarity | 83.79% +2025-05-12T13:20:47 | INFO | __main__ : Cosine Embedding Loss | 0.1621 +2025-05-12T13:20:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:20:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:20:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:21:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:21:11 | INFO | __main__ : 
Step: 59200 +2025-05-12T13:21:11 | INFO | __main__ : Current Frame Index within Batch Video: 161/247 +2025-05-12T13:21:11 | INFO | __main__ : Batch-wise Cosine Similarity | 93.98% +2025-05-12T13:21:11 | INFO | __main__ : Cosine Embedding Loss | 0.0602 +2025-05-12T13:21:11 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:21:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:21:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:21:31 | INFO | utils.basic_utils : Train Epoch: [0] [ 245/4978] eta: 3 days, 6:39:55 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0694 eval_avg_sim: 0.6483 video-cosine_similarity: 0.9306 time: 59.7648 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:21:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:21:35 | INFO | __main__ : Step: 59300 +2025-05-12T13:21:35 | INFO | __main__ : Current Frame Index within Batch Video: 20/247 +2025-05-12T13:21:35 | INFO | __main__ : Batch-wise Cosine Similarity | 68.01% +2025-05-12T13:21:35 | INFO | __main__ : Cosine Embedding Loss | 0.3199 +2025-05-12T13:21:35 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:21:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:21:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:21:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:21:58 | INFO | __main__ : Step: 59400 +2025-05-12T13:21:58 | INFO | __main__ : Current Frame Index within Batch Video: 120/247 +2025-05-12T13:21:58 | INFO | __main__ : Batch-wise Cosine Similarity | 91.75% +2025-05-12T13:21:58 | INFO | __main__ : Cosine Embedding Loss | 0.0825 +2025-05-12T13:21:58 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:21:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:21:58 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:22:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:22:22 | INFO | __main__ : Step: 59500 +2025-05-12T13:22:22 | INFO | __main__ : Current Frame Index within Batch Video: 220/247 +2025-05-12T13:22:22 | INFO | __main__ : Batch-wise Cosine Similarity | 95.74% +2025-05-12T13:22:22 | INFO | __main__ : Cosine Embedding Loss | 0.0426 +2025-05-12T13:22:22 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:22:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:22:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:22:29 | INFO | utils.basic_utils : Train Epoch: [0] [ 246/4978] eta: 3 days, 6:38:10 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0673 eval_avg_sim: 0.6483 video-cosine_similarity: 0.9327 time: 59.7640 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:22:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:22:46 | INFO | __main__ : Step: 59600 +2025-05-12T13:22:46 | INFO | __main__ : Current Frame Index within Batch Video: 79/247 +2025-05-12T13:22:46 | INFO | __main__ : Batch-wise Cosine Similarity | 86.61% +2025-05-12T13:22:46 | INFO | __main__ : Cosine Embedding Loss | 0.1339 +2025-05-12T13:22:46 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:22:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:22:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:23:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:23:10 | INFO | __main__ : Step: 59700 +2025-05-12T13:23:10 | INFO | __main__ : Current Frame Index within Batch Video: 179/247 +2025-05-12T13:23:10 | INFO | 
__main__ : Batch-wise Cosine Similarity | 90.08% +2025-05-12T13:23:10 | INFO | __main__ : Cosine Embedding Loss | 0.0992 +2025-05-12T13:23:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:23:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:23:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:23:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 247/4978] eta: 3 days, 6:36:25 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0747 eval_avg_sim: 0.6483 video-cosine_similarity: 0.9253 time: 59.7652 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:23:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:23:34 | INFO | __main__ : Step: 59800 +2025-05-12T13:23:34 | INFO | __main__ : Current Frame Index within Batch Video: 38/247 +2025-05-12T13:23:34 | INFO | __main__ : Batch-wise Cosine Similarity | 77.28% +2025-05-12T13:23:34 | INFO | __main__ : Cosine Embedding Loss | 0.2272 +2025-05-12T13:23:34 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:23:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:23:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:23:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:23:58 | INFO | __main__ : Step: 59900 +2025-05-12T13:23:58 | INFO | __main__ : Current Frame Index within Batch Video: 138/247 +2025-05-12T13:23:58 | INFO | __main__ : Batch-wise Cosine Similarity | 91.58% +2025-05-12T13:23:58 | INFO | __main__ : Cosine Embedding Loss | 0.0842 +2025-05-12T13:23:58 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:23:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:23:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:24:21 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:24:21 | INFO | __main__ : Step: 60000 +2025-05-12T13:24:21 | INFO | __main__ : Current Frame Index within Batch Video: 238/247 +2025-05-12T13:24:21 | INFO | __main__ : Batch-wise Cosine Similarity | 94.92% +2025-05-12T13:24:21 | INFO | __main__ : Cosine Embedding Loss | 0.0508 +2025-05-12T13:24:21 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:24:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:24:21 | INFO | __main__ : Evaluation Average Sim | 0.6483 +2025-05-12T13:24:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:24:21 | INFO | __main__ : Saving checkpoint at global step 60000 +2025-05-12T13:24:22 | INFO | __main__ : Performing periodic evaluation at global step 60000... +2025-05-12T13:24:22 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:24:22 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:24:22 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:24:22 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:24:31 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6401 +2025-05-12T13:24:31 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0060000.png +2025-05-12T13:24:31 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:24:31 | INFO | __main__ : Evaluation at step 60000 complete. 
Average Similarity: 0.6401 +2025-05-12T13:24:33 | INFO | utils.basic_utils : Train Epoch: [0] [ 248/4978] eta: 3 days, 6:37:40 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0541 eval_avg_sim: 0.6401 video-cosine_similarity: 0.9459 time: 59.7741 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:24:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:24:55 | INFO | __main__ : Step: 60100 +2025-05-12T13:24:55 | INFO | __main__ : Current Frame Index within Batch Video: 97/247 +2025-05-12T13:24:55 | INFO | __main__ : Batch-wise Cosine Similarity | 89.61% +2025-05-12T13:24:55 | INFO | __main__ : Cosine Embedding Loss | 0.1039 +2025-05-12T13:24:55 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:24:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:24:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:25:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:25:19 | INFO | __main__ : Step: 60200 +2025-05-12T13:25:19 | INFO | __main__ : Current Frame Index within Batch Video: 197/247 +2025-05-12T13:25:19 | INFO | __main__ : Batch-wise Cosine Similarity | 93.25% +2025-05-12T13:25:19 | INFO | __main__ : Cosine Embedding Loss | 0.0675 +2025-05-12T13:25:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:25:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:25:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:25:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 249/4978] eta: 3 days, 6:35:56 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0617 eval_avg_sim: 0.6401 video-cosine_similarity: 0.9383 time: 59.7756 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T13:25:43 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:25:43 | INFO | __main__ : Step: 60300 +2025-05-12T13:25:43 | INFO | __main__ : Current Frame Index within Batch Video: 56/247 +2025-05-12T13:25:43 | INFO | __main__ : Batch-wise Cosine Similarity | 82.81% +2025-05-12T13:25:43 | INFO | __main__ : Cosine Embedding Loss | 0.1719 +2025-05-12T13:25:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:25:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:25:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:26:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:26:06 | INFO | __main__ : Step: 60400 +2025-05-12T13:26:06 | INFO | __main__ : Current Frame Index within Batch Video: 156/247 +2025-05-12T13:26:06 | INFO | __main__ : Batch-wise Cosine Similarity | 94.34% +2025-05-12T13:26:06 | INFO | __main__ : Cosine Embedding Loss | 0.0566 +2025-05-12T13:26:06 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:26:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:26:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:26:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 250/4978] eta: 3 days, 6:34:11 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0443 eval_avg_sim: 0.6401 video-cosine_similarity: 0.9557 time: 59.7760 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:26:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:26:30 | INFO | __main__ : Step: 60500 +2025-05-12T13:26:30 | INFO | __main__ : Current Frame Index within Batch Video: 15/247 +2025-05-12T13:26:30 | INFO | __main__ : Batch-wise Cosine Similarity | 65.53% +2025-05-12T13:26:30 | INFO | __main__ : Cosine Embedding Loss | 0.3447 +2025-05-12T13:26:30 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T13:26:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:26:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:26:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:26:54 | INFO | __main__ : Step: 60600 +2025-05-12T13:26:54 | INFO | __main__ : Current Frame Index within Batch Video: 115/247 +2025-05-12T13:26:54 | INFO | __main__ : Batch-wise Cosine Similarity | 89.96% +2025-05-12T13:26:54 | INFO | __main__ : Cosine Embedding Loss | 0.1004 +2025-05-12T13:26:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:26:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:26:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:27:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:27:18 | INFO | __main__ : Step: 60700 +2025-05-12T13:27:18 | INFO | __main__ : Current Frame Index within Batch Video: 215/247 +2025-05-12T13:27:18 | INFO | __main__ : Batch-wise Cosine Similarity | 94.27% +2025-05-12T13:27:18 | INFO | __main__ : Cosine Embedding Loss | 0.0573 +2025-05-12T13:27:18 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:27:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:27:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:27:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 251/4978] eta: 3 days, 6:32:27 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0613 eval_avg_sim: 0.6401 video-cosine_similarity: 0.9387 time: 59.7766 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:27:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:27:42 | INFO | __main__ : Step: 60800 
+2025-05-12T13:27:42 | INFO | __main__ : Current Frame Index within Batch Video: 74/247 +2025-05-12T13:27:42 | INFO | __main__ : Batch-wise Cosine Similarity | 86.53% +2025-05-12T13:27:42 | INFO | __main__ : Cosine Embedding Loss | 0.1347 +2025-05-12T13:27:42 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:27:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:27:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:28:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:28:05 | INFO | __main__ : Step: 60900 +2025-05-12T13:28:05 | INFO | __main__ : Current Frame Index within Batch Video: 174/247 +2025-05-12T13:28:05 | INFO | __main__ : Batch-wise Cosine Similarity | 93.84% +2025-05-12T13:28:05 | INFO | __main__ : Cosine Embedding Loss | 0.0616 +2025-05-12T13:28:05 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:28:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:28:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:28:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 252/4978] eta: 3 days, 6:30:43 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0433 eval_avg_sim: 0.6401 video-cosine_similarity: 0.9567 time: 59.3130 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:28:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:28:29 | INFO | __main__ : Step: 61000 +2025-05-12T13:28:29 | INFO | __main__ : Current Frame Index within Batch Video: 33/247 +2025-05-12T13:28:29 | INFO | __main__ : Batch-wise Cosine Similarity | 75.61% +2025-05-12T13:28:29 | INFO | __main__ : Cosine Embedding Loss | 0.2439 +2025-05-12T13:28:29 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:28:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:28:29 | INFO | 
__main__ : Evaluation Average Sim | 0.6401 +2025-05-12T13:28:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:28:30 | INFO | __main__ : Performing periodic evaluation at global step 61000... +2025-05-12T13:28:30 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:28:30 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:28:30 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:28:30 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:28:40 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5769 +2025-05-12T13:28:40 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0061000.png +2025-05-12T13:28:40 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:28:40 | INFO | __main__ : Evaluation at step 61000 complete. 
Average Similarity: 0.5769 +2025-05-12T13:29:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:29:03 | INFO | __main__ : Step: 61100 +2025-05-12T13:29:03 | INFO | __main__ : Current Frame Index within Batch Video: 133/247 +2025-05-12T13:29:03 | INFO | __main__ : Batch-wise Cosine Similarity | 91.26% +2025-05-12T13:29:03 | INFO | __main__ : Cosine Embedding Loss | 0.0874 +2025-05-12T13:29:03 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:29:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:29:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:29:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:29:27 | INFO | __main__ : Step: 61200 +2025-05-12T13:29:27 | INFO | __main__ : Current Frame Index within Batch Video: 233/247 +2025-05-12T13:29:27 | INFO | __main__ : Batch-wise Cosine Similarity | 88.51% +2025-05-12T13:29:27 | INFO | __main__ : Cosine Embedding Loss | 0.1149 +2025-05-12T13:29:27 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:29:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:29:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:29:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 253/4978] eta: 3 days, 6:32:05 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0902 eval_avg_sim: 0.5769 video-cosine_similarity: 0.9098 time: 59.8148 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:29:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:29:51 | INFO | __main__ : Step: 61300 +2025-05-12T13:29:51 | INFO | __main__ : Current Frame Index within Batch Video: 92/247 +2025-05-12T13:29:51 | INFO | __main__ : Batch-wise Cosine Similarity | 90.28% +2025-05-12T13:29:51 | INFO | 
__main__ : Cosine Embedding Loss | 0.0972 +2025-05-12T13:29:51 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:29:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:29:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:30:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:30:15 | INFO | __main__ : Step: 61400 +2025-05-12T13:30:15 | INFO | __main__ : Current Frame Index within Batch Video: 192/247 +2025-05-12T13:30:15 | INFO | __main__ : Batch-wise Cosine Similarity | 91.77% +2025-05-12T13:30:15 | INFO | __main__ : Cosine Embedding Loss | 0.0823 +2025-05-12T13:30:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:30:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:30:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:30:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 254/4978] eta: 3 days, 6:30:21 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0829 eval_avg_sim: 0.5769 video-cosine_similarity: 0.9171 time: 59.8155 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:30:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:30:39 | INFO | __main__ : Step: 61500 +2025-05-12T13:30:39 | INFO | __main__ : Current Frame Index within Batch Video: 51/247 +2025-05-12T13:30:39 | INFO | __main__ : Batch-wise Cosine Similarity | 80.56% +2025-05-12T13:30:39 | INFO | __main__ : Cosine Embedding Loss | 0.1944 +2025-05-12T13:30:39 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:30:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:30:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:31:02 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:31:02 | INFO | __main__ : Step: 61600 +2025-05-12T13:31:02 | INFO | __main__ : Current Frame Index within Batch Video: 151/247 +2025-05-12T13:31:02 | INFO | __main__ : Batch-wise Cosine Similarity | 93.37% +2025-05-12T13:31:02 | INFO | __main__ : Cosine Embedding Loss | 0.0663 +2025-05-12T13:31:02 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:31:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:31:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:31:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 255/4978] eta: 3 days, 6:28:39 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0408 eval_avg_sim: 0.5769 video-cosine_similarity: 0.9592 time: 59.8206 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:31:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:31:26 | INFO | __main__ : Step: 61700 +2025-05-12T13:31:26 | INFO | __main__ : Current Frame Index within Batch Video: 10/247 +2025-05-12T13:31:26 | INFO | __main__ : Batch-wise Cosine Similarity | 63.14% +2025-05-12T13:31:26 | INFO | __main__ : Cosine Embedding Loss | 0.3686 +2025-05-12T13:31:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:31:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:31:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:31:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:31:50 | INFO | __main__ : Step: 61800 +2025-05-12T13:31:50 | INFO | __main__ : Current Frame Index within Batch Video: 110/247 +2025-05-12T13:31:50 | INFO | __main__ : Batch-wise Cosine Similarity | 92.34% +2025-05-12T13:31:50 | INFO | __main__ : Cosine Embedding Loss | 0.0766 +2025-05-12T13:31:50 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T13:31:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:31:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:32:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:32:14 | INFO | __main__ : Step: 61900 +2025-05-12T13:32:14 | INFO | __main__ : Current Frame Index within Batch Video: 210/247 +2025-05-12T13:32:14 | INFO | __main__ : Batch-wise Cosine Similarity | 92.63% +2025-05-12T13:32:14 | INFO | __main__ : Cosine Embedding Loss | 0.0737 +2025-05-12T13:32:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:32:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:32:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:32:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 256/4978] eta: 3 days, 6:26:56 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0735 eval_avg_sim: 0.5769 video-cosine_similarity: 0.9265 time: 59.3556 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:32:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:32:38 | INFO | __main__ : Step: 62000 +2025-05-12T13:32:38 | INFO | __main__ : Current Frame Index within Batch Video: 69/247 +2025-05-12T13:32:38 | INFO | __main__ : Batch-wise Cosine Similarity | 82.68% +2025-05-12T13:32:38 | INFO | __main__ : Cosine Embedding Loss | 0.1732 +2025-05-12T13:32:38 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:32:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:32:38 | INFO | __main__ : Evaluation Average Sim | 0.5769 +2025-05-12T13:32:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:32:38 | INFO | __main__ : Performing periodic evaluation at global step 62000... 
+2025-05-12T13:32:38 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:32:38 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:32:38 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:32:38 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:32:47 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6325 +2025-05-12T13:32:48 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0062000.png +2025-05-12T13:32:48 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:32:48 | INFO | __main__ : Evaluation at step 62000 complete. Average Similarity: 0.6325 +2025-05-12T13:33:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:33:11 | INFO | __main__ : Step: 62100 +2025-05-12T13:33:11 | INFO | __main__ : Current Frame Index within Batch Video: 169/247 +2025-05-12T13:33:11 | INFO | __main__ : Batch-wise Cosine Similarity | 93.06% +2025-05-12T13:33:11 | INFO | __main__ : Cosine Embedding Loss | 0.0694 +2025-05-12T13:33:11 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:33:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:33:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:33:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 257/4978] eta: 3 days, 6:28:05 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0681 eval_avg_sim: 0.6325 video-cosine_similarity: 0.9319 time: 59.8283 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:33:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:33:35 | INFO | __main__ : 
Step: 62200 +2025-05-12T13:33:35 | INFO | __main__ : Current Frame Index within Batch Video: 28/247 +2025-05-12T13:33:35 | INFO | __main__ : Batch-wise Cosine Similarity | 73.11% +2025-05-12T13:33:35 | INFO | __main__ : Cosine Embedding Loss | 0.2689 +2025-05-12T13:33:35 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:33:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:33:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:33:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:33:59 | INFO | __main__ : Step: 62300 +2025-05-12T13:33:59 | INFO | __main__ : Current Frame Index within Batch Video: 128/247 +2025-05-12T13:33:59 | INFO | __main__ : Batch-wise Cosine Similarity | 89.26% +2025-05-12T13:33:59 | INFO | __main__ : Cosine Embedding Loss | 0.1074 +2025-05-12T13:33:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:33:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:33:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:34:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:34:23 | INFO | __main__ : Step: 62400 +2025-05-12T13:34:23 | INFO | __main__ : Current Frame Index within Batch Video: 228/247 +2025-05-12T13:34:23 | INFO | __main__ : Batch-wise Cosine Similarity | 95.34% +2025-05-12T13:34:23 | INFO | __main__ : Cosine Embedding Loss | 0.0466 +2025-05-12T13:34:23 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:34:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:34:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:34:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 258/4978] eta: 3 days, 6:26:22 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0557 
eval_avg_sim: 0.6325 video-cosine_similarity: 0.9443 time: 59.8314 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:34:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:34:47 | INFO | __main__ : Step: 62500 +2025-05-12T13:34:47 | INFO | __main__ : Current Frame Index within Batch Video: 87/247 +2025-05-12T13:34:47 | INFO | __main__ : Batch-wise Cosine Similarity | 85.83% +2025-05-12T13:34:47 | INFO | __main__ : Cosine Embedding Loss | 0.1417 +2025-05-12T13:34:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:34:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:34:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:35:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:35:10 | INFO | __main__ : Step: 62600 +2025-05-12T13:35:10 | INFO | __main__ : Current Frame Index within Batch Video: 187/247 +2025-05-12T13:35:10 | INFO | __main__ : Batch-wise Cosine Similarity | 92.58% +2025-05-12T13:35:10 | INFO | __main__ : Cosine Embedding Loss | 0.0742 +2025-05-12T13:35:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:35:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:35:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:35:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 259/4978] eta: 3 days, 6:24:40 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0560 eval_avg_sim: 0.6325 video-cosine_similarity: 0.9440 time: 59.8326 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:35:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:35:34 | INFO | __main__ : Step: 62700 +2025-05-12T13:35:34 | INFO | __main__ : Current Frame Index within Batch Video: 46/247 +2025-05-12T13:35:34 | INFO 
| __main__ : Batch-wise Cosine Similarity | 80.41% +2025-05-12T13:35:34 | INFO | __main__ : Cosine Embedding Loss | 0.1959 +2025-05-12T13:35:34 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:35:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:35:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:35:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:35:58 | INFO | __main__ : Step: 62800 +2025-05-12T13:35:58 | INFO | __main__ : Current Frame Index within Batch Video: 146/247 +2025-05-12T13:35:58 | INFO | __main__ : Batch-wise Cosine Similarity | 92.82% +2025-05-12T13:35:58 | INFO | __main__ : Cosine Embedding Loss | 0.0718 +2025-05-12T13:35:58 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:35:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:35:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:36:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:36:22 | INFO | __main__ : Step: 62900 +2025-05-12T13:36:22 | INFO | __main__ : Current Frame Index within Batch Video: 246/247 +2025-05-12T13:36:22 | INFO | __main__ : Batch-wise Cosine Similarity | 90.25% +2025-05-12T13:36:22 | INFO | __main__ : Cosine Embedding Loss | 0.0975 +2025-05-12T13:36:22 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:36:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:36:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:36:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 260/4978] eta: 3 days, 6:22:57 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0968 eval_avg_sim: 0.6325 video-cosine_similarity: 0.9032 time: 59.3691 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:36:46 | INFO 
| __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:36:46 | INFO | __main__ : Step: 63000 +2025-05-12T13:36:46 | INFO | __main__ : Current Frame Index within Batch Video: 105/247 +2025-05-12T13:36:46 | INFO | __main__ : Batch-wise Cosine Similarity | 87.14% +2025-05-12T13:36:46 | INFO | __main__ : Cosine Embedding Loss | 0.1286 +2025-05-12T13:36:46 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:36:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:36:46 | INFO | __main__ : Evaluation Average Sim | 0.6325 +2025-05-12T13:36:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:36:46 | INFO | __main__ : Performing periodic evaluation at global step 63000... +2025-05-12T13:36:46 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:36:46 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:36:46 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:36:46 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:36:55 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4988 +2025-05-12T13:36:55 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0063000.png +2025-05-12T13:36:55 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:36:55 | INFO | __main__ : Evaluation at step 63000 complete. 
Average Similarity: 0.4988 +2025-05-12T13:37:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:37:19 | INFO | __main__ : Step: 63100 +2025-05-12T13:37:19 | INFO | __main__ : Current Frame Index within Batch Video: 205/247 +2025-05-12T13:37:19 | INFO | __main__ : Batch-wise Cosine Similarity | 92.28% +2025-05-12T13:37:19 | INFO | __main__ : Cosine Embedding Loss | 0.0772 +2025-05-12T13:37:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:37:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:37:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:37:29 | INFO | utils.basic_utils : Train Epoch: [0] [ 261/4978] eta: 3 days, 6:24:04 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0804 eval_avg_sim: 0.4988 video-cosine_similarity: 0.9196 time: 59.8373 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:37:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:37:43 | INFO | __main__ : Step: 63200 +2025-05-12T13:37:43 | INFO | __main__ : Current Frame Index within Batch Video: 64/247 +2025-05-12T13:37:43 | INFO | __main__ : Batch-wise Cosine Similarity | 83.42% +2025-05-12T13:37:43 | INFO | __main__ : Cosine Embedding Loss | 0.1658 +2025-05-12T13:37:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:37:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:37:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:38:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:38:07 | INFO | __main__ : Step: 63300 +2025-05-12T13:38:07 | INFO | __main__ : Current Frame Index within Batch Video: 164/247 +2025-05-12T13:38:07 | INFO | __main__ : Batch-wise Cosine Similarity | 90.94% +2025-05-12T13:38:07 | INFO | 
__main__ : Cosine Embedding Loss | 0.0906 +2025-05-12T13:38:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:38:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:38:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:38:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 262/4978] eta: 3 days, 6:22:20 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0575 eval_avg_sim: 0.4988 video-cosine_similarity: 0.9425 time: 59.8346 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:38:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:38:30 | INFO | __main__ : Step: 63400 +2025-05-12T13:38:30 | INFO | __main__ : Current Frame Index within Batch Video: 23/247 +2025-05-12T13:38:30 | INFO | __main__ : Batch-wise Cosine Similarity | 70.86% +2025-05-12T13:38:30 | INFO | __main__ : Cosine Embedding Loss | 0.2914 +2025-05-12T13:38:30 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:38:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:38:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:38:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:38:54 | INFO | __main__ : Step: 63500 +2025-05-12T13:38:54 | INFO | __main__ : Current Frame Index within Batch Video: 123/247 +2025-05-12T13:38:54 | INFO | __main__ : Batch-wise Cosine Similarity | 89.72% +2025-05-12T13:38:54 | INFO | __main__ : Cosine Embedding Loss | 0.1028 +2025-05-12T13:38:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:38:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:38:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:39:18 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:39:18 | INFO | __main__ : Step: 63600 +2025-05-12T13:39:18 | INFO | __main__ : Current Frame Index within Batch Video: 223/247 +2025-05-12T13:39:18 | INFO | __main__ : Batch-wise Cosine Similarity | 96.34% +2025-05-12T13:39:18 | INFO | __main__ : Cosine Embedding Loss | 0.0366 +2025-05-12T13:39:18 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:39:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:39:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:39:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 263/4978] eta: 3 days, 6:20:38 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0384 eval_avg_sim: 0.4988 video-cosine_similarity: 0.9616 time: 59.8352 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:39:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:39:42 | INFO | __main__ : Step: 63700 +2025-05-12T13:39:42 | INFO | __main__ : Current Frame Index within Batch Video: 82/247 +2025-05-12T13:39:42 | INFO | __main__ : Batch-wise Cosine Similarity | 88.89% +2025-05-12T13:39:42 | INFO | __main__ : Cosine Embedding Loss | 0.1111 +2025-05-12T13:39:42 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:39:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:39:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:06 | INFO | __main__ : Step: 63800 +2025-05-12T13:40:06 | INFO | __main__ : Current Frame Index within Batch Video: 182/247 +2025-05-12T13:40:06 | INFO | __main__ : Batch-wise Cosine Similarity | 96.19% +2025-05-12T13:40:06 | INFO | __main__ : Cosine Embedding Loss | 0.0381 +2025-05-12T13:40:06 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T13:40:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:40:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 264/4978] eta: 3 days, 6:18:56 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0363 eval_avg_sim: 0.4988 video-cosine_similarity: 0.9637 time: 59.3710 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:40:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:30 | INFO | __main__ : Step: 63900 +2025-05-12T13:40:30 | INFO | __main__ : Current Frame Index within Batch Video: 41/247 +2025-05-12T13:40:30 | INFO | __main__ : Batch-wise Cosine Similarity | 79.53% +2025-05-12T13:40:30 | INFO | __main__ : Cosine Embedding Loss | 0.2047 +2025-05-12T13:40:30 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:40:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:40:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:54 | INFO | __main__ : Step: 64000 +2025-05-12T13:40:54 | INFO | __main__ : Current Frame Index within Batch Video: 141/247 +2025-05-12T13:40:54 | INFO | __main__ : Batch-wise Cosine Similarity | 89.51% +2025-05-12T13:40:54 | INFO | __main__ : Cosine Embedding Loss | 0.1049 +2025-05-12T13:40:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:40:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:40:54 | INFO | __main__ : Evaluation Average Sim | 0.4988 +2025-05-12T13:40:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:40:54 | INFO | __main__ : Performing periodic evaluation at global step 64000... 
+2025-05-12T13:40:54 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:40:54 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:40:54 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:40:54 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:41:03 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5627 +2025-05-12T13:41:03 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0064000.png +2025-05-12T13:41:03 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:41:03 | INFO | __main__ : Evaluation at step 64000 complete. Average Similarity: 0.5627 +2025-05-12T13:41:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:41:26 | INFO | __main__ : Step: 64100 +2025-05-12T13:41:26 | INFO | __main__ : Current Frame Index within Batch Video: 241/247 +2025-05-12T13:41:26 | INFO | __main__ : Batch-wise Cosine Similarity | 94.45% +2025-05-12T13:41:26 | INFO | __main__ : Cosine Embedding Loss | 0.0555 +2025-05-12T13:41:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:41:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:41:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:41:28 | INFO | utils.basic_utils : Train Epoch: [0] [ 265/4978] eta: 3 days, 6:20:00 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0492 eval_avg_sim: 0.5627 video-cosine_similarity: 0.9508 time: 59.8370 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:41:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:41:50 | INFO | __main__ : 
Step: 64200 +2025-05-12T13:41:50 | INFO | __main__ : Current Frame Index within Batch Video: 100/247 +2025-05-12T13:41:50 | INFO | __main__ : Batch-wise Cosine Similarity | 89.38% +2025-05-12T13:41:50 | INFO | __main__ : Cosine Embedding Loss | 0.1062 +2025-05-12T13:41:50 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:41:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:41:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:42:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:42:14 | INFO | __main__ : Step: 64300 +2025-05-12T13:42:14 | INFO | __main__ : Current Frame Index within Batch Video: 200/247 +2025-05-12T13:42:14 | INFO | __main__ : Batch-wise Cosine Similarity | 94.41% +2025-05-12T13:42:14 | INFO | __main__ : Cosine Embedding Loss | 0.0559 +2025-05-12T13:42:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:42:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:42:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:42:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 266/4978] eta: 3 days, 6:18:18 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0469 eval_avg_sim: 0.5627 video-cosine_similarity: 0.9531 time: 59.8367 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:42:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:42:40 | INFO | __main__ : Step: 64400 +2025-05-12T13:42:40 | INFO | __main__ : Current Frame Index within Batch Video: 59/247 +2025-05-12T13:42:40 | INFO | __main__ : Batch-wise Cosine Similarity | 83.90% +2025-05-12T13:42:40 | INFO | __main__ : Cosine Embedding Loss | 0.1610 +2025-05-12T13:42:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:42:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:42:40 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:43:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:43:04 | INFO | __main__ : Step: 64500 +2025-05-12T13:43:04 | INFO | __main__ : Current Frame Index within Batch Video: 159/247 +2025-05-12T13:43:04 | INFO | __main__ : Batch-wise Cosine Similarity | 90.67% +2025-05-12T13:43:04 | INFO | __main__ : Cosine Embedding Loss | 0.0933 +2025-05-12T13:43:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:43:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:43:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:43:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 267/4978] eta: 3 days, 6:17:14 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0734 eval_avg_sim: 0.5627 video-cosine_similarity: 0.9266 time: 59.9435 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:43:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:43:28 | INFO | __main__ : Step: 64600 +2025-05-12T13:43:28 | INFO | __main__ : Current Frame Index within Batch Video: 18/247 +2025-05-12T13:43:28 | INFO | __main__ : Batch-wise Cosine Similarity | 70.34% +2025-05-12T13:43:28 | INFO | __main__ : Cosine Embedding Loss | 0.2966 +2025-05-12T13:43:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:43:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:43:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:43:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:43:52 | INFO | __main__ : Step: 64700 +2025-05-12T13:43:52 | INFO | __main__ : Current Frame Index within Batch Video: 118/247 +2025-05-12T13:43:52 | INFO | 
__main__ : Batch-wise Cosine Similarity | 91.41% +2025-05-12T13:43:52 | INFO | __main__ : Cosine Embedding Loss | 0.0859 +2025-05-12T13:43:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:43:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:43:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:44:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:44:16 | INFO | __main__ : Step: 64800 +2025-05-12T13:44:16 | INFO | __main__ : Current Frame Index within Batch Video: 218/247 +2025-05-12T13:44:16 | INFO | __main__ : Batch-wise Cosine Similarity | 94.64% +2025-05-12T13:44:16 | INFO | __main__ : Cosine Embedding Loss | 0.0536 +2025-05-12T13:44:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:44:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:44:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:44:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 268/4978] eta: 3 days, 6:15:32 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0432 eval_avg_sim: 0.5627 video-cosine_similarity: 0.9568 time: 59.4696 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:44:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:44:39 | INFO | __main__ : Step: 64900 +2025-05-12T13:44:39 | INFO | __main__ : Current Frame Index within Batch Video: 77/247 +2025-05-12T13:44:39 | INFO | __main__ : Batch-wise Cosine Similarity | 87.95% +2025-05-12T13:44:39 | INFO | __main__ : Cosine Embedding Loss | 0.1205 +2025-05-12T13:44:39 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:44:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:44:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:45:03 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:45:03 | INFO | __main__ : Step: 65000 +2025-05-12T13:45:03 | INFO | __main__ : Current Frame Index within Batch Video: 177/247 +2025-05-12T13:45:03 | INFO | __main__ : Batch-wise Cosine Similarity | 94.30% +2025-05-12T13:45:03 | INFO | __main__ : Cosine Embedding Loss | 0.0570 +2025-05-12T13:45:03 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:45:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:45:03 | INFO | __main__ : Evaluation Average Sim | 0.5627 +2025-05-12T13:45:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:45:03 | INFO | __main__ : Saving checkpoint at global step 65000 +2025-05-12T13:45:04 | INFO | __main__ : Performing periodic evaluation at global step 65000... +2025-05-12T13:45:04 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:45:04 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:45:04 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:45:04 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:45:13 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4995 +2025-05-12T13:45:13 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0065000.png +2025-05-12T13:45:13 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:45:13 | INFO | __main__ : Evaluation at step 65000 complete. 
Average Similarity: 0.4995 +2025-05-12T13:45:30 | INFO | utils.basic_utils : Train Epoch: [0] [ 269/4978] eta: 3 days, 6:16:40 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0497 eval_avg_sim: 0.4995 video-cosine_similarity: 0.9503 time: 59.9525 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T13:45:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:45:37 | INFO | __main__ : Step: 65100 +2025-05-12T13:45:37 | INFO | __main__ : Current Frame Index within Batch Video: 36/247 +2025-05-12T13:45:37 | INFO | __main__ : Batch-wise Cosine Similarity | 77.34% +2025-05-12T13:45:37 | INFO | __main__ : Cosine Embedding Loss | 0.2266 +2025-05-12T13:45:37 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:45:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:45:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:46:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:46:01 | INFO | __main__ : Step: 65200 +2025-05-12T13:46:01 | INFO | __main__ : Current Frame Index within Batch Video: 136/247 +2025-05-12T13:46:01 | INFO | __main__ : Batch-wise Cosine Similarity | 91.50% +2025-05-12T13:46:01 | INFO | __main__ : Cosine Embedding Loss | 0.0850 +2025-05-12T13:46:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:46:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:46:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:46:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:46:24 | INFO | __main__ : Step: 65300 +2025-05-12T13:46:24 | INFO | __main__ : Current Frame Index within Batch Video: 236/247 +2025-05-12T13:46:24 | INFO | __main__ : Batch-wise Cosine Similarity | 92.19% +2025-05-12T13:46:24 | INFO | 
__main__ : Cosine Embedding Loss | 0.0781 +2025-05-12T13:46:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:46:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:46:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:46:27 | INFO | utils.basic_utils : Train Epoch: [0] [ 270/4978] eta: 3 days, 6:14:58 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0729 eval_avg_sim: 0.4995 video-cosine_similarity: 0.9271 time: 59.9502 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:46:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:46:48 | INFO | __main__ : Step: 65400 +2025-05-12T13:46:48 | INFO | __main__ : Current Frame Index within Batch Video: 95/247 +2025-05-12T13:46:48 | INFO | __main__ : Batch-wise Cosine Similarity | 86.12% +2025-05-12T13:46:48 | INFO | __main__ : Cosine Embedding Loss | 0.1388 +2025-05-12T13:46:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:46:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:46:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:47:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:47:12 | INFO | __main__ : Step: 65500 +2025-05-12T13:47:12 | INFO | __main__ : Current Frame Index within Batch Video: 195/247 +2025-05-12T13:47:12 | INFO | __main__ : Batch-wise Cosine Similarity | 92.58% +2025-05-12T13:47:12 | INFO | __main__ : Cosine Embedding Loss | 0.0742 +2025-05-12T13:47:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:47:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:47:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:47:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 271/4978] eta: 3 days, 6:13:17 lr: 
0.000010 temperature: 0.0126 video-loss_cosine: 0.0625 eval_avg_sim: 0.4995 video-cosine_similarity: 0.9375 time: 59.9481 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:47:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:47:36 | INFO | __main__ : Step: 65600 +2025-05-12T13:47:36 | INFO | __main__ : Current Frame Index within Batch Video: 54/247 +2025-05-12T13:47:36 | INFO | __main__ : Batch-wise Cosine Similarity | 82.73% +2025-05-12T13:47:36 | INFO | __main__ : Cosine Embedding Loss | 0.1727 +2025-05-12T13:47:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:47:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:47:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:48:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:48:00 | INFO | __main__ : Step: 65700 +2025-05-12T13:48:00 | INFO | __main__ : Current Frame Index within Batch Video: 154/247 +2025-05-12T13:48:00 | INFO | __main__ : Batch-wise Cosine Similarity | 91.01% +2025-05-12T13:48:00 | INFO | __main__ : Cosine Embedding Loss | 0.0899 +2025-05-12T13:48:00 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:48:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:48:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:48:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 272/4978] eta: 3 days, 6:11:35 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0690 eval_avg_sim: 0.4995 video-cosine_similarity: 0.9310 time: 59.9469 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:48:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:48:24 | INFO | __main__ : Step: 65800 +2025-05-12T13:48:24 | INFO | __main__ : Current Frame Index 
within Batch Video: 13/247 +2025-05-12T13:48:24 | INFO | __main__ : Batch-wise Cosine Similarity | 62.81% +2025-05-12T13:48:24 | INFO | __main__ : Cosine Embedding Loss | 0.3719 +2025-05-12T13:48:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:48:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:48:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:48:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:48:47 | INFO | __main__ : Step: 65900 +2025-05-12T13:48:47 | INFO | __main__ : Current Frame Index within Batch Video: 113/247 +2025-05-12T13:48:47 | INFO | __main__ : Batch-wise Cosine Similarity | 89.45% +2025-05-12T13:48:47 | INFO | __main__ : Cosine Embedding Loss | 0.1055 +2025-05-12T13:48:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:48:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:48:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:49:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:49:11 | INFO | __main__ : Step: 66000 +2025-05-12T13:49:11 | INFO | __main__ : Current Frame Index within Batch Video: 213/247 +2025-05-12T13:49:11 | INFO | __main__ : Batch-wise Cosine Similarity | 91.70% +2025-05-12T13:49:11 | INFO | __main__ : Cosine Embedding Loss | 0.0830 +2025-05-12T13:49:11 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:49:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:49:11 | INFO | __main__ : Evaluation Average Sim | 0.4995 +2025-05-12T13:49:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:49:11 | INFO | __main__ : Performing periodic evaluation at global step 66000... 
+2025-05-12T13:49:11 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:49:12 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:49:12 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:49:12 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:49:21 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6734 +2025-05-12T13:49:21 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0066000.png +2025-05-12T13:49:21 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:49:21 | INFO | __main__ : Evaluation at step 66000 complete. Average Similarity: 0.6734 +2025-05-12T13:49:29 | INFO | utils.basic_utils : Train Epoch: [0] [ 273/4978] eta: 3 days, 6:12:36 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0969 eval_avg_sim: 0.6734 video-cosine_similarity: 0.9031 time: 59.9163 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:49:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:49:45 | INFO | __main__ : Step: 66100 +2025-05-12T13:49:45 | INFO | __main__ : Current Frame Index within Batch Video: 72/247 +2025-05-12T13:49:45 | INFO | __main__ : Batch-wise Cosine Similarity | 84.46% +2025-05-12T13:49:45 | INFO | __main__ : Cosine Embedding Loss | 0.1554 +2025-05-12T13:49:45 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:49:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:49:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:50:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:50:08 | INFO | __main__ : 
Step: 66200 +2025-05-12T13:50:08 | INFO | __main__ : Current Frame Index within Batch Video: 172/247 +2025-05-12T13:50:08 | INFO | __main__ : Batch-wise Cosine Similarity | 93.51% +2025-05-12T13:50:08 | INFO | __main__ : Cosine Embedding Loss | 0.0649 +2025-05-12T13:50:08 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:50:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:50:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:50:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 274/4978] eta: 3 days, 6:10:55 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0400 eval_avg_sim: 0.6734 video-cosine_similarity: 0.9600 time: 59.9159 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:50:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:50:32 | INFO | __main__ : Step: 66300 +2025-05-12T13:50:32 | INFO | __main__ : Current Frame Index within Batch Video: 31/247 +2025-05-12T13:50:32 | INFO | __main__ : Batch-wise Cosine Similarity | 74.61% +2025-05-12T13:50:32 | INFO | __main__ : Cosine Embedding Loss | 0.2539 +2025-05-12T13:50:32 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:50:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:50:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:50:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:50:56 | INFO | __main__ : Step: 66400 +2025-05-12T13:50:56 | INFO | __main__ : Current Frame Index within Batch Video: 131/247 +2025-05-12T13:50:56 | INFO | __main__ : Batch-wise Cosine Similarity | 88.16% +2025-05-12T13:50:56 | INFO | __main__ : Cosine Embedding Loss | 0.1184 +2025-05-12T13:50:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:50:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:50:56 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:51:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:51:20 | INFO | __main__ : Step: 66500 +2025-05-12T13:51:20 | INFO | __main__ : Current Frame Index within Batch Video: 231/247 +2025-05-12T13:51:20 | INFO | __main__ : Batch-wise Cosine Similarity | 92.20% +2025-05-12T13:51:20 | INFO | __main__ : Cosine Embedding Loss | 0.0780 +2025-05-12T13:51:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:51:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:51:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:51:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 275/4978] eta: 3 days, 6:09:16 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0672 eval_avg_sim: 0.6734 video-cosine_similarity: 0.9328 time: 59.9149 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:51:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:51:44 | INFO | __main__ : Step: 66600 +2025-05-12T13:51:44 | INFO | __main__ : Current Frame Index within Batch Video: 90/247 +2025-05-12T13:51:44 | INFO | __main__ : Batch-wise Cosine Similarity | 87.17% +2025-05-12T13:51:44 | INFO | __main__ : Cosine Embedding Loss | 0.1283 +2025-05-12T13:51:44 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:51:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:51:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:52:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:52:07 | INFO | __main__ : Step: 66700 +2025-05-12T13:52:07 | INFO | __main__ : Current Frame Index within Batch Video: 190/247 +2025-05-12T13:52:07 | INFO | 
__main__ : Batch-wise Cosine Similarity | 91.57% +2025-05-12T13:52:07 | INFO | __main__ : Cosine Embedding Loss | 0.0843 +2025-05-12T13:52:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:52:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:52:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:52:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 276/4978] eta: 3 days, 6:07:35 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0580 eval_avg_sim: 0.6734 video-cosine_similarity: 0.9420 time: 59.9127 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:52:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:52:31 | INFO | __main__ : Step: 66800 +2025-05-12T13:52:31 | INFO | __main__ : Current Frame Index within Batch Video: 49/247 +2025-05-12T13:52:31 | INFO | __main__ : Batch-wise Cosine Similarity | 82.25% +2025-05-12T13:52:31 | INFO | __main__ : Cosine Embedding Loss | 0.1775 +2025-05-12T13:52:31 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:52:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:52:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:52:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:52:55 | INFO | __main__ : Step: 66900 +2025-05-12T13:52:55 | INFO | __main__ : Current Frame Index within Batch Video: 149/247 +2025-05-12T13:52:55 | INFO | __main__ : Batch-wise Cosine Similarity | 92.81% +2025-05-12T13:52:55 | INFO | __main__ : Cosine Embedding Loss | 0.0719 +2025-05-12T13:52:55 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:52:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:52:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:53:18 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 277/4978] eta: 3 days, 6:05:55 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0451 eval_avg_sim: 0.6734 video-cosine_similarity: 0.9549 time: 59.4426 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:53:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:53:19 | INFO | __main__ : Step: 67000 +2025-05-12T13:53:19 | INFO | __main__ : Current Frame Index within Batch Video: 8/247 +2025-05-12T13:53:19 | INFO | __main__ : Batch-wise Cosine Similarity | 56.62% +2025-05-12T13:53:19 | INFO | __main__ : Cosine Embedding Loss | 0.4338 +2025-05-12T13:53:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:53:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:53:19 | INFO | __main__ : Evaluation Average Sim | 0.6734 +2025-05-12T13:53:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:53:19 | INFO | __main__ : Performing periodic evaluation at global step 67000... +2025-05-12T13:53:19 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:53:19 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:53:19 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:53:19 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:53:29 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5979 +2025-05-12T13:53:29 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0067000.png +2025-05-12T13:53:29 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:53:29 | INFO | __main__ : Evaluation at step 67000 complete. 
Average Similarity: 0.5979 +2025-05-12T13:53:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:53:53 | INFO | __main__ : Step: 67100 +2025-05-12T13:53:53 | INFO | __main__ : Current Frame Index within Batch Video: 108/247 +2025-05-12T13:53:53 | INFO | __main__ : Batch-wise Cosine Similarity | 89.38% +2025-05-12T13:53:53 | INFO | __main__ : Cosine Embedding Loss | 0.1062 +2025-05-12T13:53:53 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:53:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:53:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:54:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:54:17 | INFO | __main__ : Step: 67200 +2025-05-12T13:54:17 | INFO | __main__ : Current Frame Index within Batch Video: 208/247 +2025-05-12T13:54:17 | INFO | __main__ : Batch-wise Cosine Similarity | 94.58% +2025-05-12T13:54:17 | INFO | __main__ : Cosine Embedding Loss | 0.0542 +2025-05-12T13:54:17 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:54:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:54:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:54:26 | INFO | utils.basic_utils : Train Epoch: [0] [ 278/4978] eta: 3 days, 6:07:05 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0485 eval_avg_sim: 0.5979 video-cosine_similarity: 0.9515 time: 59.9424 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:54:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:54:41 | INFO | __main__ : Step: 67300 +2025-05-12T13:54:41 | INFO | __main__ : Current Frame Index within Batch Video: 67/247 +2025-05-12T13:54:41 | INFO | __main__ : Batch-wise Cosine Similarity | 83.16% +2025-05-12T13:54:41 | INFO | 
__main__ : Cosine Embedding Loss | 0.1684 +2025-05-12T13:54:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:54:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:54:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:55:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:55:04 | INFO | __main__ : Step: 67400 +2025-05-12T13:55:04 | INFO | __main__ : Current Frame Index within Batch Video: 167/247 +2025-05-12T13:55:04 | INFO | __main__ : Batch-wise Cosine Similarity | 91.96% +2025-05-12T13:55:04 | INFO | __main__ : Cosine Embedding Loss | 0.0804 +2025-05-12T13:55:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:55:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:55:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:55:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 279/4978] eta: 3 days, 6:05:25 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0586 eval_avg_sim: 0.5979 video-cosine_similarity: 0.9414 time: 59.9417 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:55:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:55:28 | INFO | __main__ : Step: 67500 +2025-05-12T13:55:28 | INFO | __main__ : Current Frame Index within Batch Video: 26/247 +2025-05-12T13:55:28 | INFO | __main__ : Batch-wise Cosine Similarity | 73.30% +2025-05-12T13:55:28 | INFO | __main__ : Cosine Embedding Loss | 0.2670 +2025-05-12T13:55:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:55:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:55:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:55:52 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:55:52 | INFO | __main__ : Step: 67600 +2025-05-12T13:55:52 | INFO | __main__ : Current Frame Index within Batch Video: 126/247 +2025-05-12T13:55:52 | INFO | __main__ : Batch-wise Cosine Similarity | 90.72% +2025-05-12T13:55:52 | INFO | __main__ : Cosine Embedding Loss | 0.0928 +2025-05-12T13:55:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:55:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:55:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:56:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:56:16 | INFO | __main__ : Step: 67700 +2025-05-12T13:56:16 | INFO | __main__ : Current Frame Index within Batch Video: 226/247 +2025-05-12T13:56:16 | INFO | __main__ : Batch-wise Cosine Similarity | 94.07% +2025-05-12T13:56:16 | INFO | __main__ : Cosine Embedding Loss | 0.0593 +2025-05-12T13:56:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:56:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:56:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:56:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 280/4978] eta: 3 days, 6:03:45 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0643 eval_avg_sim: 0.5979 video-cosine_similarity: 0.9357 time: 59.9400 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T13:56:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:56:40 | INFO | __main__ : Step: 67800 +2025-05-12T13:56:40 | INFO | __main__ : Current Frame Index within Batch Video: 85/247 +2025-05-12T13:56:40 | INFO | __main__ : Batch-wise Cosine Similarity | 88.17% +2025-05-12T13:56:40 | INFO | __main__ : Cosine Embedding Loss | 0.1183 +2025-05-12T13:56:40 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T13:56:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:56:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:57:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:57:04 | INFO | __main__ : Step: 67900 +2025-05-12T13:57:04 | INFO | __main__ : Current Frame Index within Batch Video: 185/247 +2025-05-12T13:57:04 | INFO | __main__ : Batch-wise Cosine Similarity | 91.32% +2025-05-12T13:57:04 | INFO | __main__ : Cosine Embedding Loss | 0.0868 +2025-05-12T13:57:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:57:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:57:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:57:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 281/4978] eta: 3 days, 6:02:06 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0955 eval_avg_sim: 0.5979 video-cosine_similarity: 0.9045 time: 59.4731 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T13:57:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:57:28 | INFO | __main__ : Step: 68000 +2025-05-12T13:57:28 | INFO | __main__ : Current Frame Index within Batch Video: 44/247 +2025-05-12T13:57:28 | INFO | __main__ : Batch-wise Cosine Similarity | 79.58% +2025-05-12T13:57:28 | INFO | __main__ : Cosine Embedding Loss | 0.2042 +2025-05-12T13:57:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:57:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:57:28 | INFO | __main__ : Evaluation Average Sim | 0.5979 +2025-05-12T13:57:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:57:28 | INFO | __main__ : Performing periodic evaluation at global step 68000... 
+2025-05-12T13:57:28 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T13:57:28 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T13:57:28 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T13:57:28 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T13:57:37 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6694 +2025-05-12T13:57:37 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0068000.png +2025-05-12T13:57:37 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T13:57:37 | INFO | __main__ : Evaluation at step 68000 complete. Average Similarity: 0.6694 +2025-05-12T13:58:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:58:01 | INFO | __main__ : Step: 68100 +2025-05-12T13:58:01 | INFO | __main__ : Current Frame Index within Batch Video: 144/247 +2025-05-12T13:58:01 | INFO | __main__ : Batch-wise Cosine Similarity | 91.27% +2025-05-12T13:58:01 | INFO | __main__ : Cosine Embedding Loss | 0.0873 +2025-05-12T13:58:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:58:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:58:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:58:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:58:25 | INFO | __main__ : Step: 68200 +2025-05-12T13:58:25 | INFO | __main__ : Current Frame Index within Batch Video: 244/247 +2025-05-12T13:58:25 | INFO | __main__ : Batch-wise Cosine Similarity | 93.64% +2025-05-12T13:58:25 | INFO | __main__ : Cosine Embedding Loss | 0.0636 
+2025-05-12T13:58:25 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:58:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:58:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:58:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 282/4978] eta: 3 days, 6:03:06 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0602 eval_avg_sim: 0.6694 video-cosine_similarity: 0.9398 time: 59.9521 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T13:58:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:58:49 | INFO | __main__ : Step: 68300 +2025-05-12T13:58:49 | INFO | __main__ : Current Frame Index within Batch Video: 103/247 +2025-05-12T13:58:49 | INFO | __main__ : Batch-wise Cosine Similarity | 87.57% +2025-05-12T13:58:49 | INFO | __main__ : Cosine Embedding Loss | 0.1243 +2025-05-12T13:58:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:58:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:58:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:59:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:59:12 | INFO | __main__ : Step: 68400 +2025-05-12T13:59:12 | INFO | __main__ : Current Frame Index within Batch Video: 203/247 +2025-05-12T13:59:12 | INFO | __main__ : Batch-wise Cosine Similarity | 91.17% +2025-05-12T13:59:12 | INFO | __main__ : Cosine Embedding Loss | 0.0883 +2025-05-12T13:59:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:59:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:59:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:59:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 283/4978] eta: 3 days, 6:01:27 lr: 0.000010 temperature: 0.0126 
video-loss_cosine: 0.0652 eval_avg_sim: 0.6694 video-cosine_similarity: 0.9348 time: 59.9536 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T13:59:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T13:59:36 | INFO | __main__ : Step: 68500 +2025-05-12T13:59:36 | INFO | __main__ : Current Frame Index within Batch Video: 62/247 +2025-05-12T13:59:36 | INFO | __main__ : Batch-wise Cosine Similarity | 84.20% +2025-05-12T13:59:36 | INFO | __main__ : Cosine Embedding Loss | 0.1580 +2025-05-12T13:59:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T13:59:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T13:59:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:00:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:00:00 | INFO | __main__ : Step: 68600 +2025-05-12T14:00:00 | INFO | __main__ : Current Frame Index within Batch Video: 162/247 +2025-05-12T14:00:00 | INFO | __main__ : Batch-wise Cosine Similarity | 92.67% +2025-05-12T14:00:00 | INFO | __main__ : Cosine Embedding Loss | 0.0733 +2025-05-12T14:00:00 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:00:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:00:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:00:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 284/4978] eta: 3 days, 5:59:48 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0687 eval_avg_sim: 0.6694 video-cosine_similarity: 0.9313 time: 59.9551 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:00:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:00:24 | INFO | __main__ : Step: 68700 +2025-05-12T14:00:24 | INFO | __main__ : Current Frame Index within Batch Video: 21/247 
+2025-05-12T14:00:24 | INFO | __main__ : Batch-wise Cosine Similarity | 67.99% +2025-05-12T14:00:24 | INFO | __main__ : Cosine Embedding Loss | 0.3201 +2025-05-12T14:00:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:00:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:00:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:00:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:00:48 | INFO | __main__ : Step: 68800 +2025-05-12T14:00:48 | INFO | __main__ : Current Frame Index within Batch Video: 121/247 +2025-05-12T14:00:48 | INFO | __main__ : Batch-wise Cosine Similarity | 89.02% +2025-05-12T14:00:48 | INFO | __main__ : Cosine Embedding Loss | 0.1098 +2025-05-12T14:00:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:00:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:00:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:01:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:01:12 | INFO | __main__ : Step: 68900 +2025-05-12T14:01:12 | INFO | __main__ : Current Frame Index within Batch Video: 221/247 +2025-05-12T14:01:12 | INFO | __main__ : Batch-wise Cosine Similarity | 91.10% +2025-05-12T14:01:12 | INFO | __main__ : Cosine Embedding Loss | 0.0890 +2025-05-12T14:01:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:01:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:01:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:01:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 285/4978] eta: 3 days, 5:58:11 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0695 eval_avg_sim: 0.6694 video-cosine_similarity: 0.9305 time: 59.4919 data: 0.0001 max mem: 11173 res mem: 15204 
+2025-05-12T14:01:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:01:36 | INFO | __main__ : Step: 69000 +2025-05-12T14:01:36 | INFO | __main__ : Current Frame Index within Batch Video: 80/247 +2025-05-12T14:01:36 | INFO | __main__ : Batch-wise Cosine Similarity | 87.62% +2025-05-12T14:01:36 | INFO | __main__ : Cosine Embedding Loss | 0.1238 +2025-05-12T14:01:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:01:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:01:36 | INFO | __main__ : Evaluation Average Sim | 0.6694 +2025-05-12T14:01:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:01:36 | INFO | __main__ : Performing periodic evaluation at global step 69000... +2025-05-12T14:01:36 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:01:36 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:01:36 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:01:36 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:01:45 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5462 +2025-05-12T14:01:45 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0069000.png +2025-05-12T14:01:45 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:01:45 | INFO | __main__ : Evaluation at step 69000 complete. 
Average Similarity: 0.5462 +2025-05-12T14:02:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:02:09 | INFO | __main__ : Step: 69100 +2025-05-12T14:02:09 | INFO | __main__ : Current Frame Index within Batch Video: 180/247 +2025-05-12T14:02:09 | INFO | __main__ : Batch-wise Cosine Similarity | 93.05% +2025-05-12T14:02:09 | INFO | __main__ : Cosine Embedding Loss | 0.0695 +2025-05-12T14:02:09 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:02:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:02:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:02:25 | INFO | utils.basic_utils : Train Epoch: [0] [ 286/4978] eta: 3 days, 5:59:06 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0424 eval_avg_sim: 0.5462 video-cosine_similarity: 0.9576 time: 59.9616 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:02:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:02:33 | INFO | __main__ : Step: 69200 +2025-05-12T14:02:33 | INFO | __main__ : Current Frame Index within Batch Video: 39/247 +2025-05-12T14:02:33 | INFO | __main__ : Batch-wise Cosine Similarity | 77.41% +2025-05-12T14:02:33 | INFO | __main__ : Cosine Embedding Loss | 0.2259 +2025-05-12T14:02:33 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:02:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:02:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:02:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:02:56 | INFO | __main__ : Step: 69300 +2025-05-12T14:02:56 | INFO | __main__ : Current Frame Index within Batch Video: 139/247 +2025-05-12T14:02:56 | INFO | __main__ : Batch-wise Cosine Similarity | 88.96% +2025-05-12T14:02:56 | INFO | 
__main__ : Cosine Embedding Loss | 0.1104 +2025-05-12T14:02:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:02:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:02:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:03:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:03:20 | INFO | __main__ : Step: 69400 +2025-05-12T14:03:20 | INFO | __main__ : Current Frame Index within Batch Video: 239/247 +2025-05-12T14:03:20 | INFO | __main__ : Batch-wise Cosine Similarity | 90.62% +2025-05-12T14:03:20 | INFO | __main__ : Cosine Embedding Loss | 0.0938 +2025-05-12T14:03:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:03:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:03:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:03:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 287/4978] eta: 3 days, 5:57:27 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0978 eval_avg_sim: 0.5462 video-cosine_similarity: 0.9022 time: 59.8542 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:03:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:03:44 | INFO | __main__ : Step: 69500 +2025-05-12T14:03:44 | INFO | __main__ : Current Frame Index within Batch Video: 98/247 +2025-05-12T14:03:44 | INFO | __main__ : Batch-wise Cosine Similarity | 87.02% +2025-05-12T14:03:44 | INFO | __main__ : Cosine Embedding Loss | 0.1298 +2025-05-12T14:03:44 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:03:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:03:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:04:08 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:04:08 | INFO | __main__ : Step: 69600 +2025-05-12T14:04:08 | INFO | __main__ : Current Frame Index within Batch Video: 198/247 +2025-05-12T14:04:08 | INFO | __main__ : Batch-wise Cosine Similarity | 93.96% +2025-05-12T14:04:08 | INFO | __main__ : Cosine Embedding Loss | 0.0604 +2025-05-12T14:04:08 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:04:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:04:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:04:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 288/4978] eta: 3 days, 5:55:48 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0491 eval_avg_sim: 0.5462 video-cosine_similarity: 0.9509 time: 59.8536 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:04:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:04:32 | INFO | __main__ : Step: 69700 +2025-05-12T14:04:32 | INFO | __main__ : Current Frame Index within Batch Video: 57/247 +2025-05-12T14:04:32 | INFO | __main__ : Batch-wise Cosine Similarity | 82.64% +2025-05-12T14:04:32 | INFO | __main__ : Cosine Embedding Loss | 0.1736 +2025-05-12T14:04:32 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:04:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:04:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:04:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:04:56 | INFO | __main__ : Step: 69800 +2025-05-12T14:04:56 | INFO | __main__ : Current Frame Index within Batch Video: 157/247 +2025-05-12T14:04:56 | INFO | __main__ : Batch-wise Cosine Similarity | 92.37% +2025-05-12T14:04:56 | INFO | __main__ : Cosine Embedding Loss | 0.0763 +2025-05-12T14:04:56 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T14:04:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:04:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:05:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 289/4978] eta: 3 days, 5:54:10 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0712 eval_avg_sim: 0.5462 video-cosine_similarity: 0.9288 time: 59.3685 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:05:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:05:19 | INFO | __main__ : Step: 69900 +2025-05-12T14:05:19 | INFO | __main__ : Current Frame Index within Batch Video: 16/247 +2025-05-12T14:05:19 | INFO | __main__ : Batch-wise Cosine Similarity | 66.91% +2025-05-12T14:05:19 | INFO | __main__ : Cosine Embedding Loss | 0.3309 +2025-05-12T14:05:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:05:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:05:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:05:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:05:43 | INFO | __main__ : Step: 70000 +2025-05-12T14:05:43 | INFO | __main__ : Current Frame Index within Batch Video: 116/247 +2025-05-12T14:05:43 | INFO | __main__ : Batch-wise Cosine Similarity | 87.59% +2025-05-12T14:05:43 | INFO | __main__ : Cosine Embedding Loss | 0.1241 +2025-05-12T14:05:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:05:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:05:43 | INFO | __main__ : Evaluation Average Sim | 0.5462 +2025-05-12T14:05:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:05:43 | INFO | __main__ : Saving checkpoint at global step 70000 +2025-05-12T14:05:44 | 
INFO | __main__ : Performing periodic evaluation at global step 70000... +2025-05-12T14:05:44 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:05:44 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:05:44 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:05:44 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:05:53 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5972 +2025-05-12T14:05:53 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0070000.png +2025-05-12T14:05:53 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:05:53 | INFO | __main__ : Evaluation at step 70000 complete. Average Similarity: 0.5972 +2025-05-12T14:06:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:06:17 | INFO | __main__ : Step: 70100 +2025-05-12T14:06:17 | INFO | __main__ : Current Frame Index within Batch Video: 216/247 +2025-05-12T14:06:17 | INFO | __main__ : Batch-wise Cosine Similarity | 92.05% +2025-05-12T14:06:17 | INFO | __main__ : Cosine Embedding Loss | 0.0795 +2025-05-12T14:06:17 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:06:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:06:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:06:24 | INFO | utils.basic_utils : Train Epoch: [0] [ 290/4978] eta: 3 days, 5:55:04 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0976 eval_avg_sim: 0.5972 video-cosine_similarity: 0.9024 time: 59.8425 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:06:40 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:06:40 | INFO | __main__ : Step: 70200 +2025-05-12T14:06:40 | INFO | __main__ : Current Frame Index within Batch Video: 75/247 +2025-05-12T14:06:40 | INFO | __main__ : Batch-wise Cosine Similarity | 85.19% +2025-05-12T14:06:40 | INFO | __main__ : Cosine Embedding Loss | 0.1481 +2025-05-12T14:06:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:06:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:06:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:07:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:07:04 | INFO | __main__ : Step: 70300 +2025-05-12T14:07:04 | INFO | __main__ : Current Frame Index within Batch Video: 175/247 +2025-05-12T14:07:04 | INFO | __main__ : Batch-wise Cosine Similarity | 91.41% +2025-05-12T14:07:04 | INFO | __main__ : Cosine Embedding Loss | 0.0859 +2025-05-12T14:07:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:07:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:07:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:07:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 291/4978] eta: 3 days, 5:53:27 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0446 eval_avg_sim: 0.5972 video-cosine_similarity: 0.9554 time: 59.8454 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:07:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:07:28 | INFO | __main__ : Step: 70400 +2025-05-12T14:07:28 | INFO | __main__ : Current Frame Index within Batch Video: 34/247 +2025-05-12T14:07:28 | INFO | __main__ : Batch-wise Cosine Similarity | 76.52% +2025-05-12T14:07:28 | INFO | __main__ : Cosine Embedding Loss | 0.2348 +2025-05-12T14:07:28 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T14:07:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:07:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:07:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:07:52 | INFO | __main__ : Step: 70500 +2025-05-12T14:07:52 | INFO | __main__ : Current Frame Index within Batch Video: 134/247 +2025-05-12T14:07:52 | INFO | __main__ : Batch-wise Cosine Similarity | 91.31% +2025-05-12T14:07:52 | INFO | __main__ : Cosine Embedding Loss | 0.0869 +2025-05-12T14:07:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:07:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:07:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:08:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:08:16 | INFO | __main__ : Step: 70600 +2025-05-12T14:08:16 | INFO | __main__ : Current Frame Index within Batch Video: 234/247 +2025-05-12T14:08:16 | INFO | __main__ : Batch-wise Cosine Similarity | 93.54% +2025-05-12T14:08:16 | INFO | __main__ : Cosine Embedding Loss | 0.0646 +2025-05-12T14:08:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:08:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:08:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:08:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 292/4978] eta: 3 days, 5:51:49 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0593 eval_avg_sim: 0.5972 video-cosine_similarity: 0.9407 time: 59.8486 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T14:08:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:08:40 | INFO | __main__ : Step: 70700 
+2025-05-12T14:08:40 | INFO | __main__ : Current Frame Index within Batch Video: 93/247 +2025-05-12T14:08:40 | INFO | __main__ : Batch-wise Cosine Similarity | 89.93% +2025-05-12T14:08:40 | INFO | __main__ : Cosine Embedding Loss | 0.1007 +2025-05-12T14:08:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:08:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:08:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:04 | INFO | __main__ : Step: 70800 +2025-05-12T14:09:04 | INFO | __main__ : Current Frame Index within Batch Video: 193/247 +2025-05-12T14:09:04 | INFO | __main__ : Batch-wise Cosine Similarity | 93.83% +2025-05-12T14:09:04 | INFO | __main__ : Cosine Embedding Loss | 0.0617 +2025-05-12T14:09:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:09:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:09:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 293/4978] eta: 3 days, 5:50:12 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0435 eval_avg_sim: 0.5972 video-cosine_similarity: 0.9565 time: 59.3803 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:09:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:27 | INFO | __main__ : Step: 70900 +2025-05-12T14:09:27 | INFO | __main__ : Current Frame Index within Batch Video: 52/247 +2025-05-12T14:09:27 | INFO | __main__ : Batch-wise Cosine Similarity | 79.30% +2025-05-12T14:09:27 | INFO | __main__ : Cosine Embedding Loss | 0.2070 +2025-05-12T14:09:27 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:09:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:09:27 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:51 | INFO | __main__ : Step: 71000 +2025-05-12T14:09:51 | INFO | __main__ : Current Frame Index within Batch Video: 152/247 +2025-05-12T14:09:51 | INFO | __main__ : Batch-wise Cosine Similarity | 90.88% +2025-05-12T14:09:51 | INFO | __main__ : Cosine Embedding Loss | 0.0912 +2025-05-12T14:09:51 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:09:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:09:51 | INFO | __main__ : Evaluation Average Sim | 0.5972 +2025-05-12T14:09:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:09:52 | INFO | __main__ : Performing periodic evaluation at global step 71000... +2025-05-12T14:09:52 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:09:52 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:09:52 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:09:52 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:10:01 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6345 +2025-05-12T14:10:01 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0071000.png +2025-05-12T14:10:01 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:10:01 | INFO | __main__ : Evaluation at step 71000 complete. 
Average Similarity: 0.6345 +2025-05-12T14:10:23 | INFO | utils.basic_utils : Train Epoch: [0] [ 294/4978] eta: 3 days, 5:51:03 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0510 eval_avg_sim: 0.6345 video-cosine_similarity: 0.9490 time: 59.8476 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:10:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:10:24 | INFO | __main__ : Step: 71100 +2025-05-12T14:10:24 | INFO | __main__ : Current Frame Index within Batch Video: 11/247 +2025-05-12T14:10:24 | INFO | __main__ : Batch-wise Cosine Similarity | 58.84% +2025-05-12T14:10:24 | INFO | __main__ : Cosine Embedding Loss | 0.4116 +2025-05-12T14:10:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:10:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:10:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:10:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:10:48 | INFO | __main__ : Step: 71200 +2025-05-12T14:10:48 | INFO | __main__ : Current Frame Index within Batch Video: 111/247 +2025-05-12T14:10:48 | INFO | __main__ : Batch-wise Cosine Similarity | 91.04% +2025-05-12T14:10:48 | INFO | __main__ : Cosine Embedding Loss | 0.0896 +2025-05-12T14:10:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:10:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:10:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:11:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:11:12 | INFO | __main__ : Step: 71300 +2025-05-12T14:11:12 | INFO | __main__ : Current Frame Index within Batch Video: 211/247 +2025-05-12T14:11:12 | INFO | __main__ : Batch-wise Cosine Similarity | 91.24% +2025-05-12T14:11:12 | INFO | 
__main__ : Cosine Embedding Loss | 0.0876 +2025-05-12T14:11:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:11:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:11:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:11:21 | INFO | utils.basic_utils : Train Epoch: [0] [ 295/4978] eta: 3 days, 5:49:26 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0677 eval_avg_sim: 0.6345 video-cosine_similarity: 0.9323 time: 59.8459 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:11:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:11:36 | INFO | __main__ : Step: 71400 +2025-05-12T14:11:36 | INFO | __main__ : Current Frame Index within Batch Video: 70/247 +2025-05-12T14:11:36 | INFO | __main__ : Batch-wise Cosine Similarity | 83.63% +2025-05-12T14:11:36 | INFO | __main__ : Cosine Embedding Loss | 0.1637 +2025-05-12T14:11:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:11:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:11:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:12:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:12:00 | INFO | __main__ : Step: 71500 +2025-05-12T14:12:00 | INFO | __main__ : Current Frame Index within Batch Video: 170/247 +2025-05-12T14:12:00 | INFO | __main__ : Batch-wise Cosine Similarity | 91.00% +2025-05-12T14:12:00 | INFO | __main__ : Cosine Embedding Loss | 0.0900 +2025-05-12T14:12:00 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:12:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:12:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:12:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 296/4978] eta: 3 days, 5:47:48 lr: 
0.000010 temperature: 0.0126 video-loss_cosine: 0.0992 eval_avg_sim: 0.6345 video-cosine_similarity: 0.9008 time: 59.8443 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:12:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:12:24 | INFO | __main__ : Step: 71600 +2025-05-12T14:12:24 | INFO | __main__ : Current Frame Index within Batch Video: 29/247 +2025-05-12T14:12:24 | INFO | __main__ : Batch-wise Cosine Similarity | 74.61% +2025-05-12T14:12:24 | INFO | __main__ : Cosine Embedding Loss | 0.2539 +2025-05-12T14:12:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:12:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:12:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:12:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:12:47 | INFO | __main__ : Step: 71700 +2025-05-12T14:12:47 | INFO | __main__ : Current Frame Index within Batch Video: 129/247 +2025-05-12T14:12:47 | INFO | __main__ : Batch-wise Cosine Similarity | 86.72% +2025-05-12T14:12:47 | INFO | __main__ : Cosine Embedding Loss | 0.1328 +2025-05-12T14:12:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:12:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:12:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:11 | INFO | __main__ : Step: 71800 +2025-05-12T14:13:11 | INFO | __main__ : Current Frame Index within Batch Video: 229/247 +2025-05-12T14:13:11 | INFO | __main__ : Batch-wise Cosine Similarity | 91.83% +2025-05-12T14:13:11 | INFO | __main__ : Cosine Embedding Loss | 0.0817 +2025-05-12T14:13:11 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:13:11 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:13:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 297/4978] eta: 3 days, 5:46:11 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0967 eval_avg_sim: 0.6345 video-cosine_similarity: 0.9033 time: 59.8457 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:13:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:35 | INFO | __main__ : Step: 71900 +2025-05-12T14:13:35 | INFO | __main__ : Current Frame Index within Batch Video: 88/247 +2025-05-12T14:13:35 | INFO | __main__ : Batch-wise Cosine Similarity | 87.71% +2025-05-12T14:13:35 | INFO | __main__ : Cosine Embedding Loss | 0.1229 +2025-05-12T14:13:35 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:13:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:13:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:59 | INFO | __main__ : Step: 72000 +2025-05-12T14:13:59 | INFO | __main__ : Current Frame Index within Batch Video: 188/247 +2025-05-12T14:13:59 | INFO | __main__ : Batch-wise Cosine Similarity | 93.41% +2025-05-12T14:13:59 | INFO | __main__ : Cosine Embedding Loss | 0.0659 +2025-05-12T14:13:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:13:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:13:59 | INFO | __main__ : Evaluation Average Sim | 0.6345 +2025-05-12T14:13:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:13:59 | INFO | __main__ : Performing periodic evaluation at global step 72000... 
+2025-05-12T14:13:59 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:13:59 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:13:59 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:13:59 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:14:09 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6282 +2025-05-12T14:14:09 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0072000.png +2025-05-12T14:14:09 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:14:09 | INFO | __main__ : Evaluation at step 72000 complete. Average Similarity: 0.6282 +2025-05-12T14:14:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 298/4978] eta: 3 days, 5:47:02 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0611 eval_avg_sim: 0.6282 video-cosine_similarity: 0.9389 time: 59.8158 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:14:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:14:32 | INFO | __main__ : Step: 72100 +2025-05-12T14:14:32 | INFO | __main__ : Current Frame Index within Batch Video: 47/247 +2025-05-12T14:14:32 | INFO | __main__ : Batch-wise Cosine Similarity | 77.59% +2025-05-12T14:14:32 | INFO | __main__ : Cosine Embedding Loss | 0.2241 +2025-05-12T14:14:32 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:14:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:14:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:14:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:14:56 | INFO | __main__ : 
Step: 72200 +2025-05-12T14:14:56 | INFO | __main__ : Current Frame Index within Batch Video: 147/247 +2025-05-12T14:14:56 | INFO | __main__ : Batch-wise Cosine Similarity | 89.46% +2025-05-12T14:14:56 | INFO | __main__ : Cosine Embedding Loss | 0.1054 +2025-05-12T14:14:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:14:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:14:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:15:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:15:20 | INFO | __main__ : Step: 72300 +2025-05-12T14:15:20 | INFO | __main__ : Current Frame Index within Batch Video: 247/247 +2025-05-12T14:15:20 | INFO | __main__ : Batch-wise Cosine Similarity | 89.50% +2025-05-12T14:15:20 | INFO | __main__ : Cosine Embedding Loss | 0.1050 +2025-05-12T14:15:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:15:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:15:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:15:20 | INFO | utils.basic_utils : Train Epoch: [0] [ 299/4978] eta: 3 days, 5:45:25 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.1050 eval_avg_sim: 0.6282 video-cosine_similarity: 0.8950 time: 59.8139 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:15:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:15:44 | INFO | __main__ : Step: 72400 +2025-05-12T14:15:44 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T14:15:44 | INFO | __main__ : Batch-wise Cosine Similarity | 85.48% +2025-05-12T14:15:44 | INFO | __main__ : Cosine Embedding Loss | 0.1452 +2025-05-12T14:15:44 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:15:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:15:44 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:16:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:16:07 | INFO | __main__ : Step: 72500 +2025-05-12T14:16:07 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T14:16:07 | INFO | __main__ : Batch-wise Cosine Similarity | 91.04% +2025-05-12T14:16:07 | INFO | __main__ : Cosine Embedding Loss | 0.0896 +2025-05-12T14:16:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:16:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:16:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:16:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 300/4978] eta: 3 days, 5:43:47 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0611 eval_avg_sim: 0.6282 video-cosine_similarity: 0.9389 time: 59.8141 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:16:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:16:31 | INFO | __main__ : Step: 72600 +2025-05-12T14:16:31 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T14:16:31 | INFO | __main__ : Batch-wise Cosine Similarity | 82.54% +2025-05-12T14:16:31 | INFO | __main__ : Cosine Embedding Loss | 0.1746 +2025-05-12T14:16:31 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:16:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:16:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:16:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:16:55 | INFO | __main__ : Step: 72700 +2025-05-12T14:16:55 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T14:16:55 | INFO | 
__main__ : Batch-wise Cosine Similarity | 92.14% +2025-05-12T14:16:55 | INFO | __main__ : Cosine Embedding Loss | 0.0786 +2025-05-12T14:16:55 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:16:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:16:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:17:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 301/4978] eta: 3 days, 5:42:11 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0617 eval_avg_sim: 0.6282 video-cosine_similarity: 0.9383 time: 59.8127 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:17:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:17:19 | INFO | __main__ : Step: 72800 +2025-05-12T14:17:19 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T14:17:19 | INFO | __main__ : Batch-wise Cosine Similarity | 74.72% +2025-05-12T14:17:19 | INFO | __main__ : Cosine Embedding Loss | 0.2528 +2025-05-12T14:17:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:17:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:17:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:17:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:17:43 | INFO | __main__ : Step: 72900 +2025-05-12T14:17:43 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T14:17:43 | INFO | __main__ : Batch-wise Cosine Similarity | 87.98% +2025-05-12T14:17:43 | INFO | __main__ : Cosine Embedding Loss | 0.1202 +2025-05-12T14:17:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:17:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:17:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:18:07 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:18:07 | INFO | __main__ : Step: 73000 +2025-05-12T14:18:07 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T14:18:07 | INFO | __main__ : Batch-wise Cosine Similarity | 93.69% +2025-05-12T14:18:07 | INFO | __main__ : Cosine Embedding Loss | 0.0631 +2025-05-12T14:18:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:18:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:18:07 | INFO | __main__ : Evaluation Average Sim | 0.6282 +2025-05-12T14:18:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:18:07 | INFO | __main__ : Performing periodic evaluation at global step 73000... +2025-05-12T14:18:07 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:18:07 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:18:07 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:18:07 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:18:16 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.4395 +2025-05-12T14:18:16 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0073000.png +2025-05-12T14:18:16 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:18:16 | INFO | __main__ : Evaluation at step 73000 complete. 
Average Similarity: 0.4395 +2025-05-12T14:18:22 | INFO | utils.basic_utils : Train Epoch: [0] [ 302/4978] eta: 3 days, 5:43:00 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0691 eval_avg_sim: 0.4395 video-cosine_similarity: 0.9309 time: 59.8050 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:18:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:18:40 | INFO | __main__ : Step: 73100 +2025-05-12T14:18:40 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T14:18:40 | INFO | __main__ : Batch-wise Cosine Similarity | 86.97% +2025-05-12T14:18:40 | INFO | __main__ : Cosine Embedding Loss | 0.1303 +2025-05-12T14:18:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:18:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:18:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:19:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:19:04 | INFO | __main__ : Step: 73200 +2025-05-12T14:19:04 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T14:19:04 | INFO | __main__ : Batch-wise Cosine Similarity | 93.45% +2025-05-12T14:19:04 | INFO | __main__ : Cosine Embedding Loss | 0.0655 +2025-05-12T14:19:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:19:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:19:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:19:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 303/4978] eta: 3 days, 5:41:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0621 eval_avg_sim: 0.4395 video-cosine_similarity: 0.9379 time: 59.8020 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:19:28 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:19:28 | INFO | __main__ : Step: 73300 +2025-05-12T14:19:28 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T14:19:28 | INFO | __main__ : Batch-wise Cosine Similarity | 79.55% +2025-05-12T14:19:28 | INFO | __main__ : Cosine Embedding Loss | 0.2045 +2025-05-12T14:19:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:19:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:19:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:19:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:19:51 | INFO | __main__ : Step: 73400 +2025-05-12T14:19:51 | INFO | __main__ : Current Frame Index within Batch Video: 142/247 +2025-05-12T14:19:51 | INFO | __main__ : Batch-wise Cosine Similarity | 90.01% +2025-05-12T14:19:52 | INFO | __main__ : Cosine Embedding Loss | 0.0999 +2025-05-12T14:19:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:19:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:19:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:20:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:20:15 | INFO | __main__ : Step: 73500 +2025-05-12T14:20:15 | INFO | __main__ : Current Frame Index within Batch Video: 242/247 +2025-05-12T14:20:15 | INFO | __main__ : Batch-wise Cosine Similarity | 91.31% +2025-05-12T14:20:15 | INFO | __main__ : Cosine Embedding Loss | 0.0869 +2025-05-12T14:20:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:20:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:20:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:20:16 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 304/4978] eta: 3 days, 5:39:47 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0844 eval_avg_sim: 0.4395 video-cosine_similarity: 0.9156 time: 59.8016 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:20:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:20:39 | INFO | __main__ : Step: 73600 +2025-05-12T14:20:39 | INFO | __main__ : Current Frame Index within Batch Video: 101/247 +2025-05-12T14:20:39 | INFO | __main__ : Batch-wise Cosine Similarity | 91.31% +2025-05-12T14:20:39 | INFO | __main__ : Cosine Embedding Loss | 0.0869 +2025-05-12T14:20:39 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:20:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:20:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:21:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:21:03 | INFO | __main__ : Step: 73700 +2025-05-12T14:21:03 | INFO | __main__ : Current Frame Index within Batch Video: 201/247 +2025-05-12T14:21:03 | INFO | __main__ : Batch-wise Cosine Similarity | 93.84% +2025-05-12T14:21:03 | INFO | __main__ : Cosine Embedding Loss | 0.0616 +2025-05-12T14:21:03 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:21:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:21:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:21:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 305/4978] eta: 3 days, 5:38:10 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0711 eval_avg_sim: 0.4395 video-cosine_similarity: 0.9289 time: 59.7959 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:21:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:21:27 | INFO | __main__ 
: Step: 73800 +2025-05-12T14:21:27 | INFO | __main__ : Current Frame Index within Batch Video: 60/247 +2025-05-12T14:21:27 | INFO | __main__ : Batch-wise Cosine Similarity | 85.76% +2025-05-12T14:21:27 | INFO | __main__ : Cosine Embedding Loss | 0.1424 +2025-05-12T14:21:27 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:21:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:21:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:21:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:21:51 | INFO | __main__ : Step: 73900 +2025-05-12T14:21:51 | INFO | __main__ : Current Frame Index within Batch Video: 160/247 +2025-05-12T14:21:51 | INFO | __main__ : Batch-wise Cosine Similarity | 92.49% +2025-05-12T14:21:51 | INFO | __main__ : Cosine Embedding Loss | 0.0751 +2025-05-12T14:21:51 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:21:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:21:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:22:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 306/4978] eta: 3 days, 5:36:34 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0655 eval_avg_sim: 0.4395 video-cosine_similarity: 0.9345 time: 59.3258 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:22:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:22:15 | INFO | __main__ : Step: 74000 +2025-05-12T14:22:15 | INFO | __main__ : Current Frame Index within Batch Video: 19/247 +2025-05-12T14:22:15 | INFO | __main__ : Batch-wise Cosine Similarity | 68.46% +2025-05-12T14:22:15 | INFO | __main__ : Cosine Embedding Loss | 0.3154 +2025-05-12T14:22:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:22:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:22:15 
| INFO | __main__ : Evaluation Average Sim | 0.4395 +2025-05-12T14:22:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:22:15 | INFO | __main__ : Performing periodic evaluation at global step 74000... +2025-05-12T14:22:15 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:22:15 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:22:15 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:22:15 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:22:25 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6490 +2025-05-12T14:22:25 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0074000.png +2025-05-12T14:22:25 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:22:25 | INFO | __main__ : Evaluation at step 74000 complete. 
Average Similarity: 0.6490 +2025-05-12T14:22:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:22:48 | INFO | __main__ : Step: 74100 +2025-05-12T14:22:48 | INFO | __main__ : Current Frame Index within Batch Video: 119/247 +2025-05-12T14:22:48 | INFO | __main__ : Batch-wise Cosine Similarity | 89.26% +2025-05-12T14:22:48 | INFO | __main__ : Cosine Embedding Loss | 0.1074 +2025-05-12T14:22:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:22:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:22:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:23:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:23:12 | INFO | __main__ : Step: 74200 +2025-05-12T14:23:12 | INFO | __main__ : Current Frame Index within Batch Video: 219/247 +2025-05-12T14:23:12 | INFO | __main__ : Batch-wise Cosine Similarity | 90.16% +2025-05-12T14:23:12 | INFO | __main__ : Cosine Embedding Loss | 0.0984 +2025-05-12T14:23:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:23:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:23:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:23:19 | INFO | utils.basic_utils : Train Epoch: [0] [ 307/4978] eta: 3 days, 5:37:28 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0843 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9157 time: 59.8176 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:23:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:23:36 | INFO | __main__ : Step: 74300 +2025-05-12T14:23:36 | INFO | __main__ : Current Frame Index within Batch Video: 78/247 +2025-05-12T14:23:36 | INFO | __main__ : Batch-wise Cosine Similarity | 85.12% +2025-05-12T14:23:36 | INFO | 
__main__ : Cosine Embedding Loss | 0.1488 +2025-05-12T14:23:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:23:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:23:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:24:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:24:00 | INFO | __main__ : Step: 74400 +2025-05-12T14:24:00 | INFO | __main__ : Current Frame Index within Batch Video: 178/247 +2025-05-12T14:24:00 | INFO | __main__ : Batch-wise Cosine Similarity | 93.15% +2025-05-12T14:24:00 | INFO | __main__ : Cosine Embedding Loss | 0.0685 +2025-05-12T14:24:00 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:24:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:24:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:24:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 308/4978] eta: 3 days, 5:35:51 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0468 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9532 time: 59.8172 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:24:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:24:24 | INFO | __main__ : Step: 74500 +2025-05-12T14:24:24 | INFO | __main__ : Current Frame Index within Batch Video: 37/247 +2025-05-12T14:24:24 | INFO | __main__ : Batch-wise Cosine Similarity | 77.27% +2025-05-12T14:24:24 | INFO | __main__ : Cosine Embedding Loss | 0.2273 +2025-05-12T14:24:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:24:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:24:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:24:47 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:24:47 | INFO | __main__ : Step: 74600 +2025-05-12T14:24:47 | INFO | __main__ : Current Frame Index within Batch Video: 137/247 +2025-05-12T14:24:47 | INFO | __main__ : Batch-wise Cosine Similarity | 90.84% +2025-05-12T14:24:47 | INFO | __main__ : Cosine Embedding Loss | 0.0916 +2025-05-12T14:24:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:24:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:24:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:25:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:25:11 | INFO | __main__ : Step: 74700 +2025-05-12T14:25:11 | INFO | __main__ : Current Frame Index within Batch Video: 237/247 +2025-05-12T14:25:11 | INFO | __main__ : Batch-wise Cosine Similarity | 90.59% +2025-05-12T14:25:11 | INFO | __main__ : Cosine Embedding Loss | 0.0941 +2025-05-12T14:25:11 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:25:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:25:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:25:13 | INFO | utils.basic_utils : Train Epoch: [0] [ 309/4978] eta: 3 days, 5:34:16 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0843 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9157 time: 59.8191 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:25:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:25:35 | INFO | __main__ : Step: 74800 +2025-05-12T14:25:35 | INFO | __main__ : Current Frame Index within Batch Video: 96/247 +2025-05-12T14:25:35 | INFO | __main__ : Batch-wise Cosine Similarity | 89.09% +2025-05-12T14:25:35 | INFO | __main__ : Cosine Embedding Loss | 0.1091 +2025-05-12T14:25:35 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T14:25:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:25:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:25:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:25:59 | INFO | __main__ : Step: 74900 +2025-05-12T14:25:59 | INFO | __main__ : Current Frame Index within Batch Video: 196/247 +2025-05-12T14:25:59 | INFO | __main__ : Batch-wise Cosine Similarity | 90.30% +2025-05-12T14:25:59 | INFO | __main__ : Cosine Embedding Loss | 0.0970 +2025-05-12T14:25:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:25:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:25:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:26:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 310/4978] eta: 3 days, 5:32:41 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0661 eval_avg_sim: 0.6490 video-cosine_similarity: 0.9339 time: 59.3479 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:26:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:26:23 | INFO | __main__ : Step: 75000 +2025-05-12T14:26:23 | INFO | __main__ : Current Frame Index within Batch Video: 55/247 +2025-05-12T14:26:23 | INFO | __main__ : Batch-wise Cosine Similarity | 80.64% +2025-05-12T14:26:23 | INFO | __main__ : Cosine Embedding Loss | 0.1936 +2025-05-12T14:26:23 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:26:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:26:23 | INFO | __main__ : Evaluation Average Sim | 0.6490 +2025-05-12T14:26:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:26:23 | INFO | __main__ : Saving checkpoint at global step 75000 +2025-05-12T14:26:23 | 
INFO | __main__ : Performing periodic evaluation at global step 75000... +2025-05-12T14:26:23 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:26:23 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:26:23 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:26:23 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:26:33 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5906 +2025-05-12T14:26:33 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0075000.png +2025-05-12T14:26:33 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:26:33 | INFO | __main__ : Evaluation at step 75000 complete. Average Similarity: 0.5906 +2025-05-12T14:26:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:26:56 | INFO | __main__ : Step: 75100 +2025-05-12T14:26:56 | INFO | __main__ : Current Frame Index within Batch Video: 155/247 +2025-05-12T14:26:56 | INFO | __main__ : Batch-wise Cosine Similarity | 89.21% +2025-05-12T14:26:56 | INFO | __main__ : Cosine Embedding Loss | 0.1079 +2025-05-12T14:26:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:26:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:26:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:27:18 | INFO | utils.basic_utils : Train Epoch: [0] [ 311/4978] eta: 3 days, 5:33:33 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0844 eval_avg_sim: 0.5906 video-cosine_similarity: 0.9156 time: 59.8363 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:27:20 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:27:20 | INFO | __main__ : Step: 75200 +2025-05-12T14:27:20 | INFO | __main__ : Current Frame Index within Batch Video: 14/247 +2025-05-12T14:27:20 | INFO | __main__ : Batch-wise Cosine Similarity | 65.30% +2025-05-12T14:27:20 | INFO | __main__ : Cosine Embedding Loss | 0.3470 +2025-05-12T14:27:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:27:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:27:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:27:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:27:44 | INFO | __main__ : Step: 75300 +2025-05-12T14:27:44 | INFO | __main__ : Current Frame Index within Batch Video: 114/247 +2025-05-12T14:27:44 | INFO | __main__ : Batch-wise Cosine Similarity | 90.79% +2025-05-12T14:27:44 | INFO | __main__ : Cosine Embedding Loss | 0.0921 +2025-05-12T14:27:44 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:27:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:27:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:28:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:28:08 | INFO | __main__ : Step: 75400 +2025-05-12T14:28:08 | INFO | __main__ : Current Frame Index within Batch Video: 214/247 +2025-05-12T14:28:08 | INFO | __main__ : Batch-wise Cosine Similarity | 95.27% +2025-05-12T14:28:08 | INFO | __main__ : Cosine Embedding Loss | 0.0473 +2025-05-12T14:28:08 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:28:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:28:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:28:16 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 312/4978] eta: 3 days, 5:31:58 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0393 eval_avg_sim: 0.5906 video-cosine_similarity: 0.9607 time: 59.8341 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:28:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:28:32 | INFO | __main__ : Step: 75500 +2025-05-12T14:28:32 | INFO | __main__ : Current Frame Index within Batch Video: 73/247 +2025-05-12T14:28:32 | INFO | __main__ : Batch-wise Cosine Similarity | 84.07% +2025-05-12T14:28:32 | INFO | __main__ : Cosine Embedding Loss | 0.1593 +2025-05-12T14:28:32 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:28:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:28:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:28:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:28:56 | INFO | __main__ : Step: 75600 +2025-05-12T14:28:56 | INFO | __main__ : Current Frame Index within Batch Video: 173/247 +2025-05-12T14:28:56 | INFO | __main__ : Batch-wise Cosine Similarity | 88.89% +2025-05-12T14:28:56 | INFO | __main__ : Cosine Embedding Loss | 0.1111 +2025-05-12T14:28:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:28:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:28:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:29:13 | INFO | utils.basic_utils : Train Epoch: [0] [ 313/4978] eta: 3 days, 5:30:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0803 eval_avg_sim: 0.5906 video-cosine_similarity: 0.9197 time: 59.8347 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:29:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:29:19 | INFO | __main__ : 
Step: 75700 +2025-05-12T14:29:19 | INFO | __main__ : Current Frame Index within Batch Video: 32/247 +2025-05-12T14:29:19 | INFO | __main__ : Batch-wise Cosine Similarity | 72.65% +2025-05-12T14:29:19 | INFO | __main__ : Cosine Embedding Loss | 0.2735 +2025-05-12T14:29:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:29:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:29:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:29:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:29:43 | INFO | __main__ : Step: 75800 +2025-05-12T14:29:43 | INFO | __main__ : Current Frame Index within Batch Video: 132/247 +2025-05-12T14:29:43 | INFO | __main__ : Batch-wise Cosine Similarity | 90.04% +2025-05-12T14:29:43 | INFO | __main__ : Cosine Embedding Loss | 0.0996 +2025-05-12T14:29:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:29:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:29:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:30:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:30:07 | INFO | __main__ : Step: 75900 +2025-05-12T14:30:07 | INFO | __main__ : Current Frame Index within Batch Video: 232/247 +2025-05-12T14:30:07 | INFO | __main__ : Batch-wise Cosine Similarity | 94.77% +2025-05-12T14:30:07 | INFO | __main__ : Cosine Embedding Loss | 0.0523 +2025-05-12T14:30:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:30:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:30:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:30:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 314/4978] eta: 3 days, 5:28:49 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0498 
eval_avg_sim: 0.5906 video-cosine_similarity: 0.9502 time: 59.3694 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:30:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:30:31 | INFO | __main__ : Step: 76000 +2025-05-12T14:30:31 | INFO | __main__ : Current Frame Index within Batch Video: 91/247 +2025-05-12T14:30:31 | INFO | __main__ : Batch-wise Cosine Similarity | 85.60% +2025-05-12T14:30:31 | INFO | __main__ : Cosine Embedding Loss | 0.1440 +2025-05-12T14:30:31 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:30:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:30:31 | INFO | __main__ : Evaluation Average Sim | 0.5906 +2025-05-12T14:30:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:30:31 | INFO | __main__ : Performing periodic evaluation at global step 76000... +2025-05-12T14:30:31 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:30:31 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:30:31 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:30:31 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:30:41 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6762 +2025-05-12T14:30:41 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0076000.png +2025-05-12T14:30:41 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:30:41 | INFO | __main__ : Evaluation at step 76000 complete. 
Average Similarity: 0.6762 +2025-05-12T14:31:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:31:04 | INFO | __main__ : Step: 76100 +2025-05-12T14:31:04 | INFO | __main__ : Current Frame Index within Batch Video: 191/247 +2025-05-12T14:31:04 | INFO | __main__ : Batch-wise Cosine Similarity | 93.84% +2025-05-12T14:31:04 | INFO | __main__ : Cosine Embedding Loss | 0.0616 +2025-05-12T14:31:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:31:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:31:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:31:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 315/4978] eta: 3 days, 5:29:32 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0590 eval_avg_sim: 0.6762 video-cosine_similarity: 0.9410 time: 59.8345 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:31:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:31:28 | INFO | __main__ : Step: 76200 +2025-05-12T14:31:28 | INFO | __main__ : Current Frame Index within Batch Video: 50/247 +2025-05-12T14:31:28 | INFO | __main__ : Batch-wise Cosine Similarity | 81.24% +2025-05-12T14:31:28 | INFO | __main__ : Cosine Embedding Loss | 0.1876 +2025-05-12T14:31:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:31:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:31:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:31:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:31:52 | INFO | __main__ : Step: 76300 +2025-05-12T14:31:52 | INFO | __main__ : Current Frame Index within Batch Video: 150/247 +2025-05-12T14:31:52 | INFO | __main__ : Batch-wise Cosine Similarity | 91.40% +2025-05-12T14:31:52 | INFO | 
__main__ : Cosine Embedding Loss | 0.0860 +2025-05-12T14:31:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:31:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:31:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:32:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 316/4978] eta: 3 days, 5:27:56 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0653 eval_avg_sim: 0.6762 video-cosine_similarity: 0.9347 time: 59.8352 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:32:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:32:16 | INFO | __main__ : Step: 76400 +2025-05-12T14:32:16 | INFO | __main__ : Current Frame Index within Batch Video: 9/247 +2025-05-12T14:32:16 | INFO | __main__ : Batch-wise Cosine Similarity | 57.88% +2025-05-12T14:32:16 | INFO | __main__ : Cosine Embedding Loss | 0.4212 +2025-05-12T14:32:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:32:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:32:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:32:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:32:39 | INFO | __main__ : Step: 76500 +2025-05-12T14:32:40 | INFO | __main__ : Current Frame Index within Batch Video: 109/247 +2025-05-12T14:32:40 | INFO | __main__ : Batch-wise Cosine Similarity | 91.29% +2025-05-12T14:32:40 | INFO | __main__ : Cosine Embedding Loss | 0.0871 +2025-05-12T14:32:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:32:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:32:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:33:03 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:33:03 | INFO | __main__ : Step: 76600 +2025-05-12T14:33:03 | INFO | __main__ : Current Frame Index within Batch Video: 209/247 +2025-05-12T14:33:03 | INFO | __main__ : Batch-wise Cosine Similarity | 94.21% +2025-05-12T14:33:03 | INFO | __main__ : Cosine Embedding Loss | 0.0579 +2025-05-12T14:33:03 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:33:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:33:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:33:12 | INFO | utils.basic_utils : Train Epoch: [0] [ 317/4978] eta: 3 days, 5:26:22 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0528 eval_avg_sim: 0.6762 video-cosine_similarity: 0.9472 time: 59.8359 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:33:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:33:27 | INFO | __main__ : Step: 76700 +2025-05-12T14:33:27 | INFO | __main__ : Current Frame Index within Batch Video: 68/247 +2025-05-12T14:33:27 | INFO | __main__ : Batch-wise Cosine Similarity | 85.10% +2025-05-12T14:33:27 | INFO | __main__ : Cosine Embedding Loss | 0.1490 +2025-05-12T14:33:27 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:33:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:33:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:33:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:33:51 | INFO | __main__ : Step: 76800 +2025-05-12T14:33:51 | INFO | __main__ : Current Frame Index within Batch Video: 168/247 +2025-05-12T14:33:51 | INFO | __main__ : Batch-wise Cosine Similarity | 91.60% +2025-05-12T14:33:51 | INFO | __main__ : Cosine Embedding Loss | 0.0840 +2025-05-12T14:33:51 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T14:33:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:33:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:34:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 318/4978] eta: 3 days, 5:24:48 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0696 eval_avg_sim: 0.6762 video-cosine_similarity: 0.9304 time: 59.3634 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:34:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:34:15 | INFO | __main__ : Step: 76900 +2025-05-12T14:34:15 | INFO | __main__ : Current Frame Index within Batch Video: 27/247 +2025-05-12T14:34:15 | INFO | __main__ : Batch-wise Cosine Similarity | 74.03% +2025-05-12T14:34:15 | INFO | __main__ : Cosine Embedding Loss | 0.2597 +2025-05-12T14:34:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:34:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:34:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:34:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:34:39 | INFO | __main__ : Step: 77000 +2025-05-12T14:34:39 | INFO | __main__ : Current Frame Index within Batch Video: 127/247 +2025-05-12T14:34:39 | INFO | __main__ : Batch-wise Cosine Similarity | 90.04% +2025-05-12T14:34:39 | INFO | __main__ : Cosine Embedding Loss | 0.0996 +2025-05-12T14:34:39 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:34:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:34:39 | INFO | __main__ : Evaluation Average Sim | 0.6762 +2025-05-12T14:34:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:34:39 | INFO | __main__ : Performing periodic evaluation at global step 77000... 
+2025-05-12T14:34:39 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:34:39 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:34:39 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:34:39 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:34:48 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6530 +2025-05-12T14:34:48 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0077000.png +2025-05-12T14:34:48 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:34:48 | INFO | __main__ : Evaluation at step 77000 complete. Average Similarity: 0.6530 +2025-05-12T14:35:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:35:12 | INFO | __main__ : Step: 77100 +2025-05-12T14:35:12 | INFO | __main__ : Current Frame Index within Batch Video: 227/247 +2025-05-12T14:35:12 | INFO | __main__ : Batch-wise Cosine Similarity | 93.15% +2025-05-12T14:35:12 | INFO | __main__ : Cosine Embedding Loss | 0.0685 +2025-05-12T14:35:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:35:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:35:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:35:17 | INFO | utils.basic_utils : Train Epoch: [0] [ 319/4978] eta: 3 days, 5:25:30 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0770 eval_avg_sim: 0.6530 video-cosine_similarity: 0.9230 time: 59.8324 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:35:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:35:36 | INFO | __main__ : 
Step: 77200 +2025-05-12T14:35:36 | INFO | __main__ : Current Frame Index within Batch Video: 86/247 +2025-05-12T14:35:36 | INFO | __main__ : Batch-wise Cosine Similarity | 85.80% +2025-05-12T14:35:36 | INFO | __main__ : Cosine Embedding Loss | 0.1420 +2025-05-12T14:35:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:35:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:35:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:35:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:35:59 | INFO | __main__ : Step: 77300 +2025-05-12T14:35:59 | INFO | __main__ : Current Frame Index within Batch Video: 186/247 +2025-05-12T14:35:59 | INFO | __main__ : Batch-wise Cosine Similarity | 90.11% +2025-05-12T14:35:59 | INFO | __main__ : Cosine Embedding Loss | 0.0989 +2025-05-12T14:35:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:35:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:35:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:36:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 320/4978] eta: 3 days, 5:23:55 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0570 eval_avg_sim: 0.6530 video-cosine_similarity: 0.9430 time: 59.8303 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:36:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:36:23 | INFO | __main__ : Step: 77400 +2025-05-12T14:36:23 | INFO | __main__ : Current Frame Index within Batch Video: 45/247 +2025-05-12T14:36:23 | INFO | __main__ : Batch-wise Cosine Similarity | 79.61% +2025-05-12T14:36:23 | INFO | __main__ : Cosine Embedding Loss | 0.2039 +2025-05-12T14:36:23 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:36:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:36:23 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:36:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:36:47 | INFO | __main__ : Step: 77500 +2025-05-12T14:36:47 | INFO | __main__ : Current Frame Index within Batch Video: 145/247 +2025-05-12T14:36:47 | INFO | __main__ : Batch-wise Cosine Similarity | 88.87% +2025-05-12T14:36:47 | INFO | __main__ : Cosine Embedding Loss | 0.1113 +2025-05-12T14:36:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:36:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:36:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:37:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:37:11 | INFO | __main__ : Step: 77600 +2025-05-12T14:37:11 | INFO | __main__ : Current Frame Index within Batch Video: 245/247 +2025-05-12T14:37:11 | INFO | __main__ : Batch-wise Cosine Similarity | 89.91% +2025-05-12T14:37:11 | INFO | __main__ : Cosine Embedding Loss | 0.1009 +2025-05-12T14:37:11 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:37:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:37:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:37:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 321/4978] eta: 3 days, 5:22:21 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.1028 eval_avg_sim: 0.6530 video-cosine_similarity: 0.8972 time: 59.8317 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:37:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:37:35 | INFO | __main__ : Step: 77700 +2025-05-12T14:37:35 | INFO | __main__ : Current Frame Index within Batch Video: 104/247 +2025-05-12T14:37:35 | INFO | 
__main__ : Batch-wise Cosine Similarity | 87.11% +2025-05-12T14:37:35 | INFO | __main__ : Cosine Embedding Loss | 0.1289 +2025-05-12T14:37:35 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:37:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:37:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:37:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:37:59 | INFO | __main__ : Step: 77800 +2025-05-12T14:37:59 | INFO | __main__ : Current Frame Index within Batch Video: 204/247 +2025-05-12T14:37:59 | INFO | __main__ : Batch-wise Cosine Similarity | 88.92% +2025-05-12T14:37:59 | INFO | __main__ : Cosine Embedding Loss | 0.1108 +2025-05-12T14:37:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:37:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:37:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:38:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 322/4978] eta: 3 days, 5:20:46 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0812 eval_avg_sim: 0.6530 video-cosine_similarity: 0.9188 time: 59.3582 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:38:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:38:23 | INFO | __main__ : Step: 77900 +2025-05-12T14:38:23 | INFO | __main__ : Current Frame Index within Batch Video: 63/247 +2025-05-12T14:38:23 | INFO | __main__ : Batch-wise Cosine Similarity | 81.82% +2025-05-12T14:38:23 | INFO | __main__ : Cosine Embedding Loss | 0.1818 +2025-05-12T14:38:23 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:38:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:38:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:38:46 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:38:46 | INFO | __main__ : Step: 78000 +2025-05-12T14:38:46 | INFO | __main__ : Current Frame Index within Batch Video: 163/247 +2025-05-12T14:38:46 | INFO | __main__ : Batch-wise Cosine Similarity | 93.14% +2025-05-12T14:38:46 | INFO | __main__ : Cosine Embedding Loss | 0.0686 +2025-05-12T14:38:46 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:38:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:38:46 | INFO | __main__ : Evaluation Average Sim | 0.6530 +2025-05-12T14:38:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:38:47 | INFO | __main__ : Performing periodic evaluation at global step 78000... +2025-05-12T14:38:47 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:38:47 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:38:47 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:38:47 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:38:56 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6812 +2025-05-12T14:38:56 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0078000.png +2025-05-12T14:38:56 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:38:56 | INFO | __main__ : Evaluation at step 78000 complete. 
Average Similarity: 0.6812 +2025-05-12T14:39:16 | INFO | utils.basic_utils : Train Epoch: [0] [ 323/4978] eta: 3 days, 5:21:27 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0955 eval_avg_sim: 0.6812 video-cosine_similarity: 0.9045 time: 59.8284 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:39:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:39:20 | INFO | __main__ : Step: 78100 +2025-05-12T14:39:20 | INFO | __main__ : Current Frame Index within Batch Video: 22/247 +2025-05-12T14:39:20 | INFO | __main__ : Batch-wise Cosine Similarity | 72.46% +2025-05-12T14:39:20 | INFO | __main__ : Cosine Embedding Loss | 0.2754 +2025-05-12T14:39:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:39:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:39:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:39:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:39:43 | INFO | __main__ : Step: 78200 +2025-05-12T14:39:43 | INFO | __main__ : Current Frame Index within Batch Video: 122/247 +2025-05-12T14:39:43 | INFO | __main__ : Batch-wise Cosine Similarity | 88.02% +2025-05-12T14:39:43 | INFO | __main__ : Cosine Embedding Loss | 0.1198 +2025-05-12T14:39:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:39:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:39:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:40:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:40:07 | INFO | __main__ : Step: 78300 +2025-05-12T14:40:07 | INFO | __main__ : Current Frame Index within Batch Video: 222/247 +2025-05-12T14:40:07 | INFO | __main__ : Batch-wise Cosine Similarity | 89.83% +2025-05-12T14:40:07 | INFO | 
__main__ : Cosine Embedding Loss | 0.1017 +2025-05-12T14:40:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:40:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:40:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:40:13 | INFO | utils.basic_utils : Train Epoch: [0] [ 324/4978] eta: 3 days, 5:19:53 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0784 eval_avg_sim: 0.6812 video-cosine_similarity: 0.9216 time: 59.8269 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:40:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:40:31 | INFO | __main__ : Step: 78400 +2025-05-12T14:40:31 | INFO | __main__ : Current Frame Index within Batch Video: 81/247 +2025-05-12T14:40:31 | INFO | __main__ : Batch-wise Cosine Similarity | 87.42% +2025-05-12T14:40:31 | INFO | __main__ : Cosine Embedding Loss | 0.1258 +2025-05-12T14:40:31 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:40:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:40:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:40:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:40:55 | INFO | __main__ : Step: 78500 +2025-05-12T14:40:55 | INFO | __main__ : Current Frame Index within Batch Video: 181/247 +2025-05-12T14:40:55 | INFO | __main__ : Batch-wise Cosine Similarity | 93.37% +2025-05-12T14:40:55 | INFO | __main__ : Cosine Embedding Loss | 0.0663 +2025-05-12T14:40:55 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:40:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:40:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:41:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 325/4978] eta: 3 days, 5:18:20 lr: 
0.000010 temperature: 0.0126 video-loss_cosine: 0.0673 eval_avg_sim: 0.6812 video-cosine_similarity: 0.9327 time: 59.8331 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T14:41:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:41:19 | INFO | __main__ : Step: 78600 +2025-05-12T14:41:19 | INFO | __main__ : Current Frame Index within Batch Video: 40/247 +2025-05-12T14:41:19 | INFO | __main__ : Batch-wise Cosine Similarity | 78.63% +2025-05-12T14:41:19 | INFO | __main__ : Cosine Embedding Loss | 0.2137 +2025-05-12T14:41:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:41:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:41:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:41:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:41:43 | INFO | __main__ : Step: 78700 +2025-05-12T14:41:43 | INFO | __main__ : Current Frame Index within Batch Video: 140/247 +2025-05-12T14:41:43 | INFO | __main__ : Batch-wise Cosine Similarity | 90.77% +2025-05-12T14:41:43 | INFO | __main__ : Cosine Embedding Loss | 0.0923 +2025-05-12T14:41:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:41:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:41:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:06 | INFO | __main__ : Step: 78800 +2025-05-12T14:42:06 | INFO | __main__ : Current Frame Index within Batch Video: 240/247 +2025-05-12T14:42:06 | INFO | __main__ : Batch-wise Cosine Similarity | 94.63% +2025-05-12T14:42:06 | INFO | __main__ : Cosine Embedding Loss | 0.0537 +2025-05-12T14:42:06 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:42:06 | 
INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:42:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 326/4978] eta: 3 days, 5:16:47 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0525 eval_avg_sim: 0.6812 video-cosine_similarity: 0.9475 time: 59.8336 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T14:42:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:30 | INFO | __main__ : Step: 78900 +2025-05-12T14:42:30 | INFO | __main__ : Current Frame Index within Batch Video: 99/247 +2025-05-12T14:42:30 | INFO | __main__ : Batch-wise Cosine Similarity | 87.57% +2025-05-12T14:42:30 | INFO | __main__ : Cosine Embedding Loss | 0.1243 +2025-05-12T14:42:30 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:42:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:42:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:54 | INFO | __main__ : Step: 79000 +2025-05-12T14:42:54 | INFO | __main__ : Current Frame Index within Batch Video: 199/247 +2025-05-12T14:42:54 | INFO | __main__ : Batch-wise Cosine Similarity | 91.58% +2025-05-12T14:42:54 | INFO | __main__ : Cosine Embedding Loss | 0.0842 +2025-05-12T14:42:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:42:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:42:54 | INFO | __main__ : Evaluation Average Sim | 0.6812 +2025-05-12T14:42:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:42:54 | INFO | __main__ : Performing periodic evaluation at global step 79000... 
+2025-05-12T14:42:54 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:42:55 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:42:55 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:42:55 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:43:04 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6937 +2025-05-12T14:43:04 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0079000.png +2025-05-12T14:43:04 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:43:04 | INFO | __main__ : Evaluation at step 79000 complete. Average Similarity: 0.6937 +2025-05-12T14:43:15 | INFO | utils.basic_utils : Train Epoch: [0] [ 327/4978] eta: 3 days, 5:17:27 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0460 eval_avg_sim: 0.6937 video-cosine_similarity: 0.9540 time: 59.8124 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T14:43:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:43:27 | INFO | __main__ : Step: 79100 +2025-05-12T14:43:27 | INFO | __main__ : Current Frame Index within Batch Video: 58/247 +2025-05-12T14:43:27 | INFO | __main__ : Batch-wise Cosine Similarity | 82.49% +2025-05-12T14:43:27 | INFO | __main__ : Cosine Embedding Loss | 0.1751 +2025-05-12T14:43:27 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:43:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:43:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:43:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:43:51 | INFO | __main__ : 
Step: 79200 +2025-05-12T14:43:51 | INFO | __main__ : Current Frame Index within Batch Video: 158/247 +2025-05-12T14:43:51 | INFO | __main__ : Batch-wise Cosine Similarity | 91.51% +2025-05-12T14:43:51 | INFO | __main__ : Cosine Embedding Loss | 0.0849 +2025-05-12T14:43:51 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:43:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:43:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:44:12 | INFO | utils.basic_utils : Train Epoch: [0] [ 328/4978] eta: 3 days, 5:15:53 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0440 eval_avg_sim: 0.6937 video-cosine_similarity: 0.9560 time: 59.8121 data: 0.0013 max mem: 11173 res mem: 15204 +2025-05-12T14:44:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:44:15 | INFO | __main__ : Step: 79300 +2025-05-12T14:44:15 | INFO | __main__ : Current Frame Index within Batch Video: 17/247 +2025-05-12T14:44:15 | INFO | __main__ : Batch-wise Cosine Similarity | 65.79% +2025-05-12T14:44:15 | INFO | __main__ : Cosine Embedding Loss | 0.3421 +2025-05-12T14:44:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:44:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:44:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:44:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:44:39 | INFO | __main__ : Step: 79400 +2025-05-12T14:44:39 | INFO | __main__ : Current Frame Index within Batch Video: 117/247 +2025-05-12T14:44:39 | INFO | __main__ : Batch-wise Cosine Similarity | 91.40% +2025-05-12T14:44:39 | INFO | __main__ : Cosine Embedding Loss | 0.0860 +2025-05-12T14:44:39 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:44:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:44:39 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:45:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:45:03 | INFO | __main__ : Step: 79500 +2025-05-12T14:45:03 | INFO | __main__ : Current Frame Index within Batch Video: 217/247 +2025-05-12T14:45:03 | INFO | __main__ : Batch-wise Cosine Similarity | 95.45% +2025-05-12T14:45:03 | INFO | __main__ : Cosine Embedding Loss | 0.0455 +2025-05-12T14:45:03 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:45:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:45:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:45:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 329/4978] eta: 3 days, 5:14:20 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0499 eval_avg_sim: 0.6937 video-cosine_similarity: 0.9501 time: 59.8114 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:45:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:45:27 | INFO | __main__ : Step: 79600 +2025-05-12T14:45:27 | INFO | __main__ : Current Frame Index within Batch Video: 76/247 +2025-05-12T14:45:27 | INFO | __main__ : Batch-wise Cosine Similarity | 86.95% +2025-05-12T14:45:27 | INFO | __main__ : Cosine Embedding Loss | 0.1305 +2025-05-12T14:45:27 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:45:27 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:45:27 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:45:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:45:50 | INFO | __main__ : Step: 79700 +2025-05-12T14:45:50 | INFO | __main__ : Current Frame Index within Batch Video: 176/247 +2025-05-12T14:45:50 | INFO | 
__main__ : Batch-wise Cosine Similarity | 91.93% +2025-05-12T14:45:50 | INFO | __main__ : Cosine Embedding Loss | 0.0807 +2025-05-12T14:45:50 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:45:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:45:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:46:07 | INFO | utils.basic_utils : Train Epoch: [0] [ 330/4978] eta: 3 days, 5:12:47 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0568 eval_avg_sim: 0.6937 video-cosine_similarity: 0.9432 time: 59.8110 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:46:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:46:14 | INFO | __main__ : Step: 79800 +2025-05-12T14:46:14 | INFO | __main__ : Current Frame Index within Batch Video: 35/247 +2025-05-12T14:46:14 | INFO | __main__ : Batch-wise Cosine Similarity | 76.29% +2025-05-12T14:46:14 | INFO | __main__ : Cosine Embedding Loss | 0.2371 +2025-05-12T14:46:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:46:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:46:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:46:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:46:38 | INFO | __main__ : Step: 79900 +2025-05-12T14:46:38 | INFO | __main__ : Current Frame Index within Batch Video: 135/247 +2025-05-12T14:46:38 | INFO | __main__ : Batch-wise Cosine Similarity | 87.38% +2025-05-12T14:46:38 | INFO | __main__ : Cosine Embedding Loss | 0.1262 +2025-05-12T14:46:38 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:46:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:46:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:47:02 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:47:02 | INFO | __main__ : Step: 80000 +2025-05-12T14:47:02 | INFO | __main__ : Current Frame Index within Batch Video: 235/247 +2025-05-12T14:47:02 | INFO | __main__ : Batch-wise Cosine Similarity | 93.80% +2025-05-12T14:47:02 | INFO | __main__ : Cosine Embedding Loss | 0.0620 +2025-05-12T14:47:02 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:47:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:47:02 | INFO | __main__ : Evaluation Average Sim | 0.6937 +2025-05-12T14:47:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:47:02 | INFO | __main__ : Saving checkpoint at global step 80000 +2025-05-12T14:47:02 | INFO | __main__ : Performing periodic evaluation at global step 80000... +2025-05-12T14:47:02 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:47:02 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:47:02 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:47:02 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:47:12 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5030 +2025-05-12T14:47:12 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0080000.png +2025-05-12T14:47:12 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:47:12 | INFO | __main__ : Evaluation at step 80000 complete. 
Average Similarity: 0.5030 +2025-05-12T14:47:14 | INFO | utils.basic_utils : Train Epoch: [0] [ 331/4978] eta: 3 days, 5:13:28 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0619 eval_avg_sim: 0.5030 video-cosine_similarity: 0.9381 time: 59.7972 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:47:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:47:35 | INFO | __main__ : Step: 80100 +2025-05-12T14:47:35 | INFO | __main__ : Current Frame Index within Batch Video: 94/247 +2025-05-12T14:47:35 | INFO | __main__ : Batch-wise Cosine Similarity | 90.44% +2025-05-12T14:47:35 | INFO | __main__ : Cosine Embedding Loss | 0.0956 +2025-05-12T14:47:35 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:47:35 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:47:35 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:47:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:47:59 | INFO | __main__ : Step: 80200 +2025-05-12T14:47:59 | INFO | __main__ : Current Frame Index within Batch Video: 194/247 +2025-05-12T14:47:59 | INFO | __main__ : Batch-wise Cosine Similarity | 94.83% +2025-05-12T14:47:59 | INFO | __main__ : Cosine Embedding Loss | 0.0517 +2025-05-12T14:47:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:47:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:47:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:48:12 | INFO | utils.basic_utils : Train Epoch: [0] [ 332/4978] eta: 3 days, 5:11:54 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0461 eval_avg_sim: 0.5030 video-cosine_similarity: 0.9539 time: 59.7947 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:48:23 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:48:23 | INFO | __main__ : Step: 80300 +2025-05-12T14:48:23 | INFO | __main__ : Current Frame Index within Batch Video: 53/247 +2025-05-12T14:48:23 | INFO | __main__ : Batch-wise Cosine Similarity | 82.74% +2025-05-12T14:48:23 | INFO | __main__ : Cosine Embedding Loss | 0.1726 +2025-05-12T14:48:23 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:48:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:48:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:48:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:48:47 | INFO | __main__ : Step: 80400 +2025-05-12T14:48:47 | INFO | __main__ : Current Frame Index within Batch Video: 153/247 +2025-05-12T14:48:47 | INFO | __main__ : Batch-wise Cosine Similarity | 92.13% +2025-05-12T14:48:47 | INFO | __main__ : Cosine Embedding Loss | 0.0787 +2025-05-12T14:48:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:48:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:48:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:49:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 333/4978] eta: 3 days, 5:10:21 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0397 eval_avg_sim: 0.5030 video-cosine_similarity: 0.9603 time: 59.7918 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T14:49:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:49:11 | INFO | __main__ : Step: 80500 +2025-05-12T14:49:11 | INFO | __main__ : Current Frame Index within Batch Video: 12/247 +2025-05-12T14:49:11 | INFO | __main__ : Batch-wise Cosine Similarity | 64.45% +2025-05-12T14:49:11 | INFO | __main__ : Cosine Embedding Loss | 0.3555 +2025-05-12T14:49:11 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T14:49:11 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:49:11 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:49:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:49:34 | INFO | __main__ : Step: 80600 +2025-05-12T14:49:34 | INFO | __main__ : Current Frame Index within Batch Video: 112/247 +2025-05-12T14:49:34 | INFO | __main__ : Batch-wise Cosine Similarity | 91.27% +2025-05-12T14:49:34 | INFO | __main__ : Cosine Embedding Loss | 0.0873 +2025-05-12T14:49:34 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:49:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:49:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:49:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:49:58 | INFO | __main__ : Step: 80700 +2025-05-12T14:49:58 | INFO | __main__ : Current Frame Index within Batch Video: 212/247 +2025-05-12T14:49:58 | INFO | __main__ : Batch-wise Cosine Similarity | 94.24% +2025-05-12T14:49:58 | INFO | __main__ : Cosine Embedding Loss | 0.0576 +2025-05-12T14:49:58 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:49:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:49:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:50:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 334/4978] eta: 3 days, 5:08:48 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0609 eval_avg_sim: 0.5030 video-cosine_similarity: 0.9391 time: 59.7891 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:50:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:50:22 | INFO | __main__ : Step: 80800 
+2025-05-12T14:50:22 | INFO | __main__ : Current Frame Index within Batch Video: 71/247 +2025-05-12T14:50:22 | INFO | __main__ : Batch-wise Cosine Similarity | 86.27% +2025-05-12T14:50:22 | INFO | __main__ : Cosine Embedding Loss | 0.1373 +2025-05-12T14:50:22 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:50:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:50:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:50:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:50:46 | INFO | __main__ : Step: 80900 +2025-05-12T14:50:46 | INFO | __main__ : Current Frame Index within Batch Video: 171/247 +2025-05-12T14:50:46 | INFO | __main__ : Batch-wise Cosine Similarity | 95.15% +2025-05-12T14:50:46 | INFO | __main__ : Cosine Embedding Loss | 0.0485 +2025-05-12T14:50:46 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:50:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:50:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:51:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 335/4978] eta: 3 days, 5:07:16 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0619 eval_avg_sim: 0.5030 video-cosine_similarity: 0.9381 time: 59.3220 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:51:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:51:10 | INFO | __main__ : Step: 81000 +2025-05-12T14:51:10 | INFO | __main__ : Current Frame Index within Batch Video: 30/247 +2025-05-12T14:51:10 | INFO | __main__ : Batch-wise Cosine Similarity | 73.07% +2025-05-12T14:51:10 | INFO | __main__ : Cosine Embedding Loss | 0.2693 +2025-05-12T14:51:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:51:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:51:10 | INFO | 
__main__ : Evaluation Average Sim | 0.5030 +2025-05-12T14:51:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:51:10 | INFO | __main__ : Performing periodic evaluation at global step 81000... +2025-05-12T14:51:10 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:51:10 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:51:10 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:51:10 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:51:20 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6935 +2025-05-12T14:51:20 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0081000.png +2025-05-12T14:51:20 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:51:20 | INFO | __main__ : Evaluation at step 81000 complete. 
Average Similarity: 0.6935 +2025-05-12T14:51:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:51:43 | INFO | __main__ : Step: 81100 +2025-05-12T14:51:43 | INFO | __main__ : Current Frame Index within Batch Video: 130/247 +2025-05-12T14:51:43 | INFO | __main__ : Batch-wise Cosine Similarity | 88.96% +2025-05-12T14:51:43 | INFO | __main__ : Cosine Embedding Loss | 0.1104 +2025-05-12T14:51:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:51:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:51:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:52:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:52:07 | INFO | __main__ : Step: 81200 +2025-05-12T14:52:07 | INFO | __main__ : Current Frame Index within Batch Video: 230/247 +2025-05-12T14:52:07 | INFO | __main__ : Batch-wise Cosine Similarity | 89.31% +2025-05-12T14:52:07 | INFO | __main__ : Cosine Embedding Loss | 0.1069 +2025-05-12T14:52:07 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:52:07 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:52:07 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:52:11 | INFO | utils.basic_utils : Train Epoch: [0] [ 336/4978] eta: 3 days, 5:07:58 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0876 eval_avg_sim: 0.6935 video-cosine_similarity: 0.9124 time: 59.8130 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:52:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:52:31 | INFO | __main__ : Step: 81300 +2025-05-12T14:52:31 | INFO | __main__ : Current Frame Index within Batch Video: 89/247 +2025-05-12T14:52:31 | INFO | __main__ : Batch-wise Cosine Similarity | 86.11% +2025-05-12T14:52:31 | INFO | 
__main__ : Cosine Embedding Loss | 0.1389 +2025-05-12T14:52:31 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:52:31 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:52:31 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:52:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:52:55 | INFO | __main__ : Step: 81400 +2025-05-12T14:52:55 | INFO | __main__ : Current Frame Index within Batch Video: 189/247 +2025-05-12T14:52:55 | INFO | __main__ : Batch-wise Cosine Similarity | 90.85% +2025-05-12T14:52:55 | INFO | __main__ : Cosine Embedding Loss | 0.0915 +2025-05-12T14:52:55 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:52:55 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:52:55 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:53:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 337/4978] eta: 3 days, 5:06:26 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0837 eval_avg_sim: 0.6935 video-cosine_similarity: 0.9163 time: 59.8105 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:53:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:53:19 | INFO | __main__ : Step: 81500 +2025-05-12T14:53:19 | INFO | __main__ : Current Frame Index within Batch Video: 48/247 +2025-05-12T14:53:19 | INFO | __main__ : Batch-wise Cosine Similarity | 79.86% +2025-05-12T14:53:19 | INFO | __main__ : Cosine Embedding Loss | 0.2014 +2025-05-12T14:53:19 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:53:19 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:53:19 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:53:43 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:53:43 | INFO | __main__ : Step: 81600 +2025-05-12T14:53:43 | INFO | __main__ : Current Frame Index within Batch Video: 148/247 +2025-05-12T14:53:43 | INFO | __main__ : Batch-wise Cosine Similarity | 86.54% +2025-05-12T14:53:43 | INFO | __main__ : Cosine Embedding Loss | 0.1346 +2025-05-12T14:53:43 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:53:43 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:53:43 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:54:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 338/4978] eta: 3 days, 5:04:54 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.1351 eval_avg_sim: 0.6935 video-cosine_similarity: 0.8649 time: 59.8110 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:54:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:54:06 | INFO | __main__ : Step: 81700 +2025-05-12T14:54:06 | INFO | __main__ : Current Frame Index within Batch Video: 7/247 +2025-05-12T14:54:06 | INFO | __main__ : Batch-wise Cosine Similarity | 60.61% +2025-05-12T14:54:06 | INFO | __main__ : Cosine Embedding Loss | 0.3939 +2025-05-12T14:54:06 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:54:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:54:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:54:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:54:30 | INFO | __main__ : Step: 81800 +2025-05-12T14:54:30 | INFO | __main__ : Current Frame Index within Batch Video: 107/247 +2025-05-12T14:54:30 | INFO | __main__ : Batch-wise Cosine Similarity | 89.06% +2025-05-12T14:54:30 | INFO | __main__ : Cosine Embedding Loss | 0.1094 +2025-05-12T14:54:30 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T14:54:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:54:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:54:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:54:54 | INFO | __main__ : Step: 81900 +2025-05-12T14:54:54 | INFO | __main__ : Current Frame Index within Batch Video: 207/247 +2025-05-12T14:54:54 | INFO | __main__ : Batch-wise Cosine Similarity | 92.74% +2025-05-12T14:54:54 | INFO | __main__ : Cosine Embedding Loss | 0.0726 +2025-05-12T14:54:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:54:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:54:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:55:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 339/4978] eta: 3 days, 5:03:22 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0668 eval_avg_sim: 0.6935 video-cosine_similarity: 0.9332 time: 59.3438 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:55:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:55:18 | INFO | __main__ : Step: 82000 +2025-05-12T14:55:18 | INFO | __main__ : Current Frame Index within Batch Video: 66/247 +2025-05-12T14:55:18 | INFO | __main__ : Batch-wise Cosine Similarity | 85.96% +2025-05-12T14:55:18 | INFO | __main__ : Cosine Embedding Loss | 0.1404 +2025-05-12T14:55:18 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:55:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:55:18 | INFO | __main__ : Evaluation Average Sim | 0.6935 +2025-05-12T14:55:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:55:18 | INFO | __main__ : Performing periodic evaluation at global step 82000... 
+2025-05-12T14:55:18 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:55:18 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:55:18 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:55:18 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:55:27 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5568 +2025-05-12T14:55:28 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0082000.png +2025-05-12T14:55:28 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:55:28 | INFO | __main__ : Evaluation at step 82000 complete. Average Similarity: 0.5568 +2025-05-12T14:55:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:55:51 | INFO | __main__ : Step: 82100 +2025-05-12T14:55:51 | INFO | __main__ : Current Frame Index within Batch Video: 166/247 +2025-05-12T14:55:51 | INFO | __main__ : Batch-wise Cosine Similarity | 93.17% +2025-05-12T14:55:51 | INFO | __main__ : Cosine Embedding Loss | 0.0683 +2025-05-12T14:55:51 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:55:51 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:55:51 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:56:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 340/4978] eta: 3 days, 5:03:57 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0709 eval_avg_sim: 0.5568 video-cosine_similarity: 0.9291 time: 59.8132 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:56:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:56:15 | INFO | __main__ : 
Step: 82200 +2025-05-12T14:56:15 | INFO | __main__ : Current Frame Index within Batch Video: 25/247 +2025-05-12T14:56:15 | INFO | __main__ : Batch-wise Cosine Similarity | 73.06% +2025-05-12T14:56:15 | INFO | __main__ : Cosine Embedding Loss | 0.2694 +2025-05-12T14:56:15 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:56:15 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:56:15 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:56:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:56:39 | INFO | __main__ : Step: 82300 +2025-05-12T14:56:39 | INFO | __main__ : Current Frame Index within Batch Video: 125/247 +2025-05-12T14:56:39 | INFO | __main__ : Batch-wise Cosine Similarity | 90.14% +2025-05-12T14:56:39 | INFO | __main__ : Cosine Embedding Loss | 0.0986 +2025-05-12T14:56:39 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:56:39 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:56:39 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:57:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:57:03 | INFO | __main__ : Step: 82400 +2025-05-12T14:57:03 | INFO | __main__ : Current Frame Index within Batch Video: 225/247 +2025-05-12T14:57:03 | INFO | __main__ : Batch-wise Cosine Similarity | 92.60% +2025-05-12T14:57:03 | INFO | __main__ : Cosine Embedding Loss | 0.0740 +2025-05-12T14:57:03 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:57:03 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:57:03 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:57:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 341/4978] eta: 3 days, 5:02:25 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0600 
eval_avg_sim: 0.5568 video-cosine_similarity: 0.9400 time: 59.8136 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:57:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:57:26 | INFO | __main__ : Step: 82500 +2025-05-12T14:57:26 | INFO | __main__ : Current Frame Index within Batch Video: 84/247 +2025-05-12T14:57:26 | INFO | __main__ : Batch-wise Cosine Similarity | 86.86% +2025-05-12T14:57:26 | INFO | __main__ : Cosine Embedding Loss | 0.1314 +2025-05-12T14:57:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:57:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:57:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:57:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:57:50 | INFO | __main__ : Step: 82600 +2025-05-12T14:57:50 | INFO | __main__ : Current Frame Index within Batch Video: 184/247 +2025-05-12T14:57:50 | INFO | __main__ : Batch-wise Cosine Similarity | 91.24% +2025-05-12T14:57:50 | INFO | __main__ : Cosine Embedding Loss | 0.0876 +2025-05-12T14:57:50 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:57:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:57:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:58:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 342/4978] eta: 3 days, 5:00:53 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0788 eval_avg_sim: 0.5568 video-cosine_similarity: 0.9212 time: 59.8165 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:58:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:58:14 | INFO | __main__ : Step: 82700 +2025-05-12T14:58:14 | INFO | __main__ : Current Frame Index within Batch Video: 43/247 +2025-05-12T14:58:14 | INFO 
| __main__ : Batch-wise Cosine Similarity | 78.23% +2025-05-12T14:58:14 | INFO | __main__ : Cosine Embedding Loss | 0.2177 +2025-05-12T14:58:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:58:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:58:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:58:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:58:38 | INFO | __main__ : Step: 82800 +2025-05-12T14:58:38 | INFO | __main__ : Current Frame Index within Batch Video: 143/247 +2025-05-12T14:58:38 | INFO | __main__ : Batch-wise Cosine Similarity | 90.67% +2025-05-12T14:58:38 | INFO | __main__ : Cosine Embedding Loss | 0.0933 +2025-05-12T14:58:38 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:58:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:58:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:59:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:59:02 | INFO | __main__ : Step: 82900 +2025-05-12T14:59:02 | INFO | __main__ : Current Frame Index within Batch Video: 243/247 +2025-05-12T14:59:02 | INFO | __main__ : Batch-wise Cosine Similarity | 94.54% +2025-05-12T14:59:02 | INFO | __main__ : Cosine Embedding Loss | 0.0546 +2025-05-12T14:59:02 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:59:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:59:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:59:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 343/4978] eta: 3 days, 4:59:22 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0609 eval_avg_sim: 0.5568 video-cosine_similarity: 0.9391 time: 59.3477 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T14:59:26 | INFO 
| __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:59:26 | INFO | __main__ : Step: 83000 +2025-05-12T14:59:26 | INFO | __main__ : Current Frame Index within Batch Video: 102/247 +2025-05-12T14:59:26 | INFO | __main__ : Batch-wise Cosine Similarity | 89.25% +2025-05-12T14:59:26 | INFO | __main__ : Cosine Embedding Loss | 0.1075 +2025-05-12T14:59:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:59:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:59:26 | INFO | __main__ : Evaluation Average Sim | 0.5568 +2025-05-12T14:59:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:59:26 | INFO | __main__ : Performing periodic evaluation at global step 83000... +2025-05-12T14:59:26 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T14:59:26 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T14:59:26 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T14:59:26 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T14:59:35 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6197 +2025-05-12T14:59:35 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0083000.png +2025-05-12T14:59:35 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T14:59:35 | INFO | __main__ : Evaluation at step 83000 complete. 
Average Similarity: 0.6197 +2025-05-12T14:59:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T14:59:59 | INFO | __main__ : Step: 83100 +2025-05-12T14:59:59 | INFO | __main__ : Current Frame Index within Batch Video: 202/247 +2025-05-12T14:59:59 | INFO | __main__ : Batch-wise Cosine Similarity | 94.13% +2025-05-12T14:59:59 | INFO | __main__ : Cosine Embedding Loss | 0.0587 +2025-05-12T14:59:59 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T14:59:59 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T14:59:59 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:00:10 | INFO | utils.basic_utils : Train Epoch: [0] [ 344/4978] eta: 3 days, 4:59:57 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0497 eval_avg_sim: 0.6197 video-cosine_similarity: 0.9503 time: 59.8172 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:00:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:00:23 | INFO | __main__ : Step: 83200 +2025-05-12T15:00:23 | INFO | __main__ : Current Frame Index within Batch Video: 61/247 +2025-05-12T15:00:23 | INFO | __main__ : Batch-wise Cosine Similarity | 83.67% +2025-05-12T15:00:23 | INFO | __main__ : Cosine Embedding Loss | 0.1633 +2025-05-12T15:00:23 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:00:23 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:00:23 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:00:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:00:47 | INFO | __main__ : Step: 83300 +2025-05-12T15:00:47 | INFO | __main__ : Current Frame Index within Batch Video: 161/247 +2025-05-12T15:00:47 | INFO | __main__ : Batch-wise Cosine Similarity | 91.22% +2025-05-12T15:00:47 | INFO | 
__main__ : Cosine Embedding Loss | 0.0878 +2025-05-12T15:00:47 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:00:47 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:00:47 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:01:07 | INFO | utils.basic_utils : Train Epoch: [0] [ 345/4978] eta: 3 days, 4:58:25 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0580 eval_avg_sim: 0.6197 video-cosine_similarity: 0.9420 time: 59.8146 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:01:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:01:10 | INFO | __main__ : Step: 83400 +2025-05-12T15:01:10 | INFO | __main__ : Current Frame Index within Batch Video: 20/247 +2025-05-12T15:01:10 | INFO | __main__ : Batch-wise Cosine Similarity | 69.59% +2025-05-12T15:01:10 | INFO | __main__ : Cosine Embedding Loss | 0.3041 +2025-05-12T15:01:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:01:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:01:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:01:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:01:34 | INFO | __main__ : Step: 83500 +2025-05-12T15:01:34 | INFO | __main__ : Current Frame Index within Batch Video: 120/247 +2025-05-12T15:01:34 | INFO | __main__ : Batch-wise Cosine Similarity | 89.99% +2025-05-12T15:01:34 | INFO | __main__ : Cosine Embedding Loss | 0.1001 +2025-05-12T15:01:34 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:01:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:01:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:01:58 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:01:58 | INFO | __main__ : Step: 83600 +2025-05-12T15:01:58 | INFO | __main__ : Current Frame Index within Batch Video: 220/247 +2025-05-12T15:01:58 | INFO | __main__ : Batch-wise Cosine Similarity | 94.69% +2025-05-12T15:01:58 | INFO | __main__ : Cosine Embedding Loss | 0.0531 +2025-05-12T15:01:58 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:01:58 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:01:58 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:02:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 346/4978] eta: 3 days, 4:56:54 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0538 eval_avg_sim: 0.6197 video-cosine_similarity: 0.9462 time: 59.8144 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:02:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:02:22 | INFO | __main__ : Step: 83700 +2025-05-12T15:02:22 | INFO | __main__ : Current Frame Index within Batch Video: 79/247 +2025-05-12T15:02:22 | INFO | __main__ : Batch-wise Cosine Similarity | 86.72% +2025-05-12T15:02:22 | INFO | __main__ : Cosine Embedding Loss | 0.1328 +2025-05-12T15:02:22 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:02:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:02:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:02:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:02:46 | INFO | __main__ : Step: 83800 +2025-05-12T15:02:46 | INFO | __main__ : Current Frame Index within Batch Video: 179/247 +2025-05-12T15:02:46 | INFO | __main__ : Batch-wise Cosine Similarity | 89.41% +2025-05-12T15:02:46 | INFO | __main__ : Cosine Embedding Loss | 0.1059 +2025-05-12T15:02:46 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T15:02:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:02:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:03:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 347/4978] eta: 3 days, 4:55:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0914 eval_avg_sim: 0.6197 video-cosine_similarity: 0.9086 time: 59.3431 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:03:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:03:10 | INFO | __main__ : Step: 83900 +2025-05-12T15:03:10 | INFO | __main__ : Current Frame Index within Batch Video: 38/247 +2025-05-12T15:03:10 | INFO | __main__ : Batch-wise Cosine Similarity | 79.45% +2025-05-12T15:03:10 | INFO | __main__ : Cosine Embedding Loss | 0.2055 +2025-05-12T15:03:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:03:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:03:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:03:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:03:33 | INFO | __main__ : Step: 84000 +2025-05-12T15:03:33 | INFO | __main__ : Current Frame Index within Batch Video: 138/247 +2025-05-12T15:03:33 | INFO | __main__ : Batch-wise Cosine Similarity | 90.78% +2025-05-12T15:03:33 | INFO | __main__ : Cosine Embedding Loss | 0.0922 +2025-05-12T15:03:33 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:03:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:03:33 | INFO | __main__ : Evaluation Average Sim | 0.6197 +2025-05-12T15:03:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:03:34 | INFO | __main__ : Performing periodic evaluation at global step 84000... 
+2025-05-12T15:03:34 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:03:34 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:03:34 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:03:34 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:03:43 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6500 +2025-05-12T15:03:43 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0084000.png +2025-05-12T15:03:43 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:03:43 | INFO | __main__ : Evaluation at step 84000 complete. Average Similarity: 0.6500 +2025-05-12T15:04:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:04:06 | INFO | __main__ : Step: 84100 +2025-05-12T15:04:06 | INFO | __main__ : Current Frame Index within Batch Video: 238/247 +2025-05-12T15:04:06 | INFO | __main__ : Batch-wise Cosine Similarity | 93.25% +2025-05-12T15:04:06 | INFO | __main__ : Cosine Embedding Loss | 0.0675 +2025-05-12T15:04:06 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:04:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:04:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:04:09 | INFO | utils.basic_utils : Train Epoch: [0] [ 348/4978] eta: 3 days, 4:55:55 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0700 eval_avg_sim: 0.6500 video-cosine_similarity: 0.9300 time: 59.8093 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:04:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:04:30 | INFO | __main__ : 
Step: 84200 +2025-05-12T15:04:30 | INFO | __main__ : Current Frame Index within Batch Video: 97/247 +2025-05-12T15:04:30 | INFO | __main__ : Batch-wise Cosine Similarity | 87.22% +2025-05-12T15:04:30 | INFO | __main__ : Cosine Embedding Loss | 0.1278 +2025-05-12T15:04:30 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:04:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:04:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:04:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:04:54 | INFO | __main__ : Step: 84300 +2025-05-12T15:04:54 | INFO | __main__ : Current Frame Index within Batch Video: 197/247 +2025-05-12T15:04:54 | INFO | __main__ : Batch-wise Cosine Similarity | 94.13% +2025-05-12T15:04:54 | INFO | __main__ : Cosine Embedding Loss | 0.0587 +2025-05-12T15:04:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:04:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:04:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:05:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 349/4978] eta: 3 days, 4:54:24 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0521 eval_avg_sim: 0.6500 video-cosine_similarity: 0.9479 time: 59.8099 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:05:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:05:18 | INFO | __main__ : Step: 84400 +2025-05-12T15:05:18 | INFO | __main__ : Current Frame Index within Batch Video: 56/247 +2025-05-12T15:05:18 | INFO | __main__ : Batch-wise Cosine Similarity | 82.46% +2025-05-12T15:05:18 | INFO | __main__ : Cosine Embedding Loss | 0.1754 +2025-05-12T15:05:18 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:05:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:05:18 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:05:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:05:42 | INFO | __main__ : Step: 84500 +2025-05-12T15:05:42 | INFO | __main__ : Current Frame Index within Batch Video: 156/247 +2025-05-12T15:05:42 | INFO | __main__ : Batch-wise Cosine Similarity | 93.80% +2025-05-12T15:05:42 | INFO | __main__ : Cosine Embedding Loss | 0.0620 +2025-05-12T15:05:42 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:05:42 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:05:42 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:06:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 350/4978] eta: 3 days, 4:52:52 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0431 eval_avg_sim: 0.6500 video-cosine_similarity: 0.9569 time: 59.8058 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:06:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:06:06 | INFO | __main__ : Step: 84600 +2025-05-12T15:06:06 | INFO | __main__ : Current Frame Index within Batch Video: 15/247 +2025-05-12T15:06:06 | INFO | __main__ : Batch-wise Cosine Similarity | 66.17% +2025-05-12T15:06:06 | INFO | __main__ : Cosine Embedding Loss | 0.3383 +2025-05-12T15:06:06 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:06:06 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:06:06 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:06:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:06:30 | INFO | __main__ : Step: 84700 +2025-05-12T15:06:30 | INFO | __main__ : Current Frame Index within Batch Video: 115/247 +2025-05-12T15:06:30 | INFO | __main__ 
: Batch-wise Cosine Similarity | 90.28% +2025-05-12T15:06:30 | INFO | __main__ : Cosine Embedding Loss | 0.0972 +2025-05-12T15:06:30 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:06:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:06:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:06:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:06:53 | INFO | __main__ : Step: 84800 +2025-05-12T15:06:53 | INFO | __main__ : Current Frame Index within Batch Video: 215/247 +2025-05-12T15:06:53 | INFO | __main__ : Batch-wise Cosine Similarity | 92.77% +2025-05-12T15:06:53 | INFO | __main__ : Cosine Embedding Loss | 0.0723 +2025-05-12T15:06:53 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:06:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:06:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:07:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 351/4978] eta: 3 days, 4:51:21 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0670 eval_avg_sim: 0.6500 video-cosine_similarity: 0.9330 time: 59.3306 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:07:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:07:17 | INFO | __main__ : Step: 84900 +2025-05-12T15:07:17 | INFO | __main__ : Current Frame Index within Batch Video: 74/247 +2025-05-12T15:07:17 | INFO | __main__ : Batch-wise Cosine Similarity | 83.90% +2025-05-12T15:07:17 | INFO | __main__ : Cosine Embedding Loss | 0.1610 +2025-05-12T15:07:17 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:07:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:07:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:07:41 | INFO | __main__ 
: ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:07:41 | INFO | __main__ : Step: 85000 +2025-05-12T15:07:41 | INFO | __main__ : Current Frame Index within Batch Video: 174/247 +2025-05-12T15:07:41 | INFO | __main__ : Batch-wise Cosine Similarity | 91.99% +2025-05-12T15:07:41 | INFO | __main__ : Cosine Embedding Loss | 0.0801 +2025-05-12T15:07:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:07:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:07:41 | INFO | __main__ : Evaluation Average Sim | 0.6500 +2025-05-12T15:07:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:07:41 | INFO | __main__ : Saving checkpoint at global step 85000 +2025-05-12T15:07:41 | INFO | __main__ : Performing periodic evaluation at global step 85000... +2025-05-12T15:07:41 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:07:42 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:07:42 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:07:42 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:07:51 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6002 +2025-05-12T15:07:51 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0085000.png +2025-05-12T15:07:51 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:07:51 | INFO | __main__ : Evaluation at step 85000 complete. 
Average Similarity: 0.6002 +2025-05-12T15:08:08 | INFO | utils.basic_utils : Train Epoch: [0] [ 352/4978] eta: 3 days, 4:51:54 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.1022 eval_avg_sim: 0.6002 video-cosine_similarity: 0.8978 time: 59.8026 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:08:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:08:14 | INFO | __main__ : Step: 85100 +2025-05-12T15:08:14 | INFO | __main__ : Current Frame Index within Batch Video: 33/247 +2025-05-12T15:08:14 | INFO | __main__ : Batch-wise Cosine Similarity | 76.05% +2025-05-12T15:08:14 | INFO | __main__ : Cosine Embedding Loss | 0.2395 +2025-05-12T15:08:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:08:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:08:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:08:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:08:38 | INFO | __main__ : Step: 85200 +2025-05-12T15:08:38 | INFO | __main__ : Current Frame Index within Batch Video: 133/247 +2025-05-12T15:08:38 | INFO | __main__ : Batch-wise Cosine Similarity | 87.02% +2025-05-12T15:08:38 | INFO | __main__ : Cosine Embedding Loss | 0.1298 +2025-05-12T15:08:38 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:08:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:08:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:09:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:09:02 | INFO | __main__ : Step: 85300 +2025-05-12T15:09:02 | INFO | __main__ : Current Frame Index within Batch Video: 233/247 +2025-05-12T15:09:02 | INFO | __main__ : Batch-wise Cosine Similarity | 92.58% +2025-05-12T15:09:02 | INFO | 
__main__ : Cosine Embedding Loss | 0.0742 +2025-05-12T15:09:02 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:09:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:09:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:09:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 353/4978] eta: 3 days, 4:50:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0825 eval_avg_sim: 0.6002 video-cosine_similarity: 0.9175 time: 59.8035 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:09:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:09:26 | INFO | __main__ : Step: 85400 +2025-05-12T15:09:26 | INFO | __main__ : Current Frame Index within Batch Video: 92/247 +2025-05-12T15:09:26 | INFO | __main__ : Batch-wise Cosine Similarity | 87.71% +2025-05-12T15:09:26 | INFO | __main__ : Cosine Embedding Loss | 0.1229 +2025-05-12T15:09:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:09:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:09:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:09:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:09:50 | INFO | __main__ : Step: 85500 +2025-05-12T15:09:50 | INFO | __main__ : Current Frame Index within Batch Video: 192/247 +2025-05-12T15:09:50 | INFO | __main__ : Batch-wise Cosine Similarity | 91.46% +2025-05-12T15:09:50 | INFO | __main__ : Cosine Embedding Loss | 0.0854 +2025-05-12T15:09:50 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:09:50 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:09:50 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:10:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 354/4978] eta: 3 days, 4:48:53 lr: 
0.000010 temperature: 0.0126 video-loss_cosine: 0.0684 eval_avg_sim: 0.6002 video-cosine_similarity: 0.9316 time: 59.8051 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:10:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:10:14 | INFO | __main__ : Step: 85600 +2025-05-12T15:10:14 | INFO | __main__ : Current Frame Index within Batch Video: 51/247 +2025-05-12T15:10:14 | INFO | __main__ : Batch-wise Cosine Similarity | 80.29% +2025-05-12T15:10:14 | INFO | __main__ : Cosine Embedding Loss | 0.1971 +2025-05-12T15:10:14 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:10:14 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:10:14 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:10:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:10:37 | INFO | __main__ : Step: 85700 +2025-05-12T15:10:37 | INFO | __main__ : Current Frame Index within Batch Video: 151/247 +2025-05-12T15:10:37 | INFO | __main__ : Batch-wise Cosine Similarity | 91.88% +2025-05-12T15:10:37 | INFO | __main__ : Cosine Embedding Loss | 0.0812 +2025-05-12T15:10:37 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:10:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:10:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:00 | INFO | utils.basic_utils : Train Epoch: [0] [ 355/4978] eta: 3 days, 4:47:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0558 eval_avg_sim: 0.6002 video-cosine_similarity: 0.9442 time: 59.8081 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:11:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:01 | INFO | __main__ : Step: 85800 +2025-05-12T15:11:01 | INFO | __main__ : Current Frame Index 
within Batch Video: 10/247 +2025-05-12T15:11:01 | INFO | __main__ : Batch-wise Cosine Similarity | 57.68% +2025-05-12T15:11:01 | INFO | __main__ : Cosine Embedding Loss | 0.4232 +2025-05-12T15:11:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:11:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:11:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:25 | INFO | __main__ : Step: 85900 +2025-05-12T15:11:25 | INFO | __main__ : Current Frame Index within Batch Video: 110/247 +2025-05-12T15:11:25 | INFO | __main__ : Batch-wise Cosine Similarity | 89.76% +2025-05-12T15:11:25 | INFO | __main__ : Cosine Embedding Loss | 0.1024 +2025-05-12T15:11:25 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:11:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:11:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:49 | INFO | __main__ : Step: 86000 +2025-05-12T15:11:49 | INFO | __main__ : Current Frame Index within Batch Video: 210/247 +2025-05-12T15:11:49 | INFO | __main__ : Batch-wise Cosine Similarity | 93.84% +2025-05-12T15:11:49 | INFO | __main__ : Cosine Embedding Loss | 0.0616 +2025-05-12T15:11:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:11:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:11:49 | INFO | __main__ : Evaluation Average Sim | 0.6002 +2025-05-12T15:11:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:11:49 | INFO | __main__ : Performing periodic evaluation at global step 86000... 
+2025-05-12T15:11:49 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:11:49 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:11:49 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:11:49 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:11:58 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5039 +2025-05-12T15:11:59 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0086000.png +2025-05-12T15:11:59 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:11:59 | INFO | __main__ : Evaluation at step 86000 complete. Average Similarity: 0.5039 +2025-05-12T15:12:07 | INFO | utils.basic_utils : Train Epoch: [0] [ 356/4978] eta: 3 days, 4:47:55 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0568 eval_avg_sim: 0.5039 video-cosine_similarity: 0.9432 time: 59.7926 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:12:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:12:22 | INFO | __main__ : Step: 86100 +2025-05-12T15:12:22 | INFO | __main__ : Current Frame Index within Batch Video: 69/247 +2025-05-12T15:12:22 | INFO | __main__ : Batch-wise Cosine Similarity | 86.24% +2025-05-12T15:12:22 | INFO | __main__ : Cosine Embedding Loss | 0.1376 +2025-05-12T15:12:22 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:12:22 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:12:22 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:12:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:12:46 | INFO | __main__ : 
Step: 86200 +2025-05-12T15:12:46 | INFO | __main__ : Current Frame Index within Batch Video: 169/247 +2025-05-12T15:12:46 | INFO | __main__ : Batch-wise Cosine Similarity | 92.81% +2025-05-12T15:12:46 | INFO | __main__ : Cosine Embedding Loss | 0.0719 +2025-05-12T15:12:46 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:12:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:12:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:13:05 | INFO | utils.basic_utils : Train Epoch: [0] [ 357/4978] eta: 3 days, 4:46:24 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0756 eval_avg_sim: 0.5039 video-cosine_similarity: 0.9244 time: 59.7906 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:13:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:13:10 | INFO | __main__ : Step: 86300 +2025-05-12T15:13:10 | INFO | __main__ : Current Frame Index within Batch Video: 28/247 +2025-05-12T15:13:10 | INFO | __main__ : Batch-wise Cosine Similarity | 75.53% +2025-05-12T15:13:10 | INFO | __main__ : Cosine Embedding Loss | 0.2447 +2025-05-12T15:13:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:13:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:13:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:13:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:13:34 | INFO | __main__ : Step: 86400 +2025-05-12T15:13:34 | INFO | __main__ : Current Frame Index within Batch Video: 128/247 +2025-05-12T15:13:34 | INFO | __main__ : Batch-wise Cosine Similarity | 91.38% +2025-05-12T15:13:34 | INFO | __main__ : Cosine Embedding Loss | 0.0862 +2025-05-12T15:13:34 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:13:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:13:34 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:13:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:13:57 | INFO | __main__ : Step: 86500 +2025-05-12T15:13:57 | INFO | __main__ : Current Frame Index within Batch Video: 228/247 +2025-05-12T15:13:57 | INFO | __main__ : Batch-wise Cosine Similarity | 94.25% +2025-05-12T15:13:57 | INFO | __main__ : Cosine Embedding Loss | 0.0575 +2025-05-12T15:13:57 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:13:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:13:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:14:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 358/4978] eta: 3 days, 4:44:54 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0505 eval_avg_sim: 0.5039 video-cosine_similarity: 0.9495 time: 59.7899 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:14:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:14:21 | INFO | __main__ : Step: 86600 +2025-05-12T15:14:21 | INFO | __main__ : Current Frame Index within Batch Video: 87/247 +2025-05-12T15:14:21 | INFO | __main__ : Batch-wise Cosine Similarity | 88.40% +2025-05-12T15:14:21 | INFO | __main__ : Cosine Embedding Loss | 0.1160 +2025-05-12T15:14:21 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:14:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:14:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:14:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:14:45 | INFO | __main__ : Step: 86700 +2025-05-12T15:14:45 | INFO | __main__ : Current Frame Index within Batch Video: 187/247 +2025-05-12T15:14:45 | INFO | 
__main__ : Batch-wise Cosine Similarity | 92.45% +2025-05-12T15:14:45 | INFO | __main__ : Cosine Embedding Loss | 0.0755 +2025-05-12T15:14:45 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:14:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:14:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:14:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 359/4978] eta: 3 days, 4:43:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0754 eval_avg_sim: 0.5039 video-cosine_similarity: 0.9246 time: 59.7879 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:15:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:15:09 | INFO | __main__ : Step: 86800 +2025-05-12T15:15:09 | INFO | __main__ : Current Frame Index within Batch Video: 46/247 +2025-05-12T15:15:09 | INFO | __main__ : Batch-wise Cosine Similarity | 80.90% +2025-05-12T15:15:09 | INFO | __main__ : Cosine Embedding Loss | 0.1910 +2025-05-12T15:15:09 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:15:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:15:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:15:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:15:33 | INFO | __main__ : Step: 86900 +2025-05-12T15:15:33 | INFO | __main__ : Current Frame Index within Batch Video: 146/247 +2025-05-12T15:15:33 | INFO | __main__ : Batch-wise Cosine Similarity | 90.94% +2025-05-12T15:15:33 | INFO | __main__ : Cosine Embedding Loss | 0.0906 +2025-05-12T15:15:33 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:15:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:15:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:15:57 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:15:57 | INFO | __main__ : Step: 87000 +2025-05-12T15:15:57 | INFO | __main__ : Current Frame Index within Batch Video: 246/247 +2025-05-12T15:15:57 | INFO | __main__ : Batch-wise Cosine Similarity | 93.50% +2025-05-12T15:15:57 | INFO | __main__ : Cosine Embedding Loss | 0.0650 +2025-05-12T15:15:57 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:15:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:15:57 | INFO | __main__ : Evaluation Average Sim | 0.5039 +2025-05-12T15:15:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:15:57 | INFO | __main__ : Performing periodic evaluation at global step 87000... +2025-05-12T15:15:57 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:15:57 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:15:57 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:15:57 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:16:06 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5810 +2025-05-12T15:16:06 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0087000.png +2025-05-12T15:16:06 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:16:06 | INFO | __main__ : Evaluation at step 87000 complete. 
Average Similarity: 0.5810 +2025-05-12T15:16:06 | INFO | utils.basic_utils : Train Epoch: [0] [ 360/4978] eta: 3 days, 4:43:52 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0682 eval_avg_sim: 0.5810 video-cosine_similarity: 0.9318 time: 59.7862 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:16:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:16:30 | INFO | __main__ : Step: 87100 +2025-05-12T15:16:30 | INFO | __main__ : Current Frame Index within Batch Video: 105/247 +2025-05-12T15:16:30 | INFO | __main__ : Batch-wise Cosine Similarity | 86.46% +2025-05-12T15:16:30 | INFO | __main__ : Cosine Embedding Loss | 0.1354 +2025-05-12T15:16:30 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:16:30 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:16:30 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:16:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:16:54 | INFO | __main__ : Step: 87200 +2025-05-12T15:16:54 | INFO | __main__ : Current Frame Index within Batch Video: 205/247 +2025-05-12T15:16:54 | INFO | __main__ : Batch-wise Cosine Similarity | 95.13% +2025-05-12T15:16:54 | INFO | __main__ : Cosine Embedding Loss | 0.0487 +2025-05-12T15:16:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:16:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:16:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:17:04 | INFO | utils.basic_utils : Train Epoch: [0] [ 361/4978] eta: 3 days, 4:42:23 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0635 eval_avg_sim: 0.5810 video-cosine_similarity: 0.9365 time: 59.7860 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:17:18 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:17:18 | INFO | __main__ : Step: 87300 +2025-05-12T15:17:18 | INFO | __main__ : Current Frame Index within Batch Video: 64/247 +2025-05-12T15:17:18 | INFO | __main__ : Batch-wise Cosine Similarity | 82.81% +2025-05-12T15:17:18 | INFO | __main__ : Cosine Embedding Loss | 0.1719 +2025-05-12T15:17:18 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:17:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:17:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:17:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:17:41 | INFO | __main__ : Step: 87400 +2025-05-12T15:17:41 | INFO | __main__ : Current Frame Index within Batch Video: 164/247 +2025-05-12T15:17:41 | INFO | __main__ : Batch-wise Cosine Similarity | 92.22% +2025-05-12T15:17:41 | INFO | __main__ : Cosine Embedding Loss | 0.0778 +2025-05-12T15:17:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:17:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:17:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 362/4978] eta: 3 days, 4:40:53 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0622 eval_avg_sim: 0.5810 video-cosine_similarity: 0.9378 time: 59.7876 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:18:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:05 | INFO | __main__ : Step: 87500 +2025-05-12T15:18:05 | INFO | __main__ : Current Frame Index within Batch Video: 23/247 +2025-05-12T15:18:05 | INFO | __main__ : Batch-wise Cosine Similarity | 75.15% +2025-05-12T15:18:05 | INFO | __main__ : Cosine Embedding Loss | 0.2485 +2025-05-12T15:18:05 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T15:18:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:18:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:29 | INFO | __main__ : Step: 87600 +2025-05-12T15:18:29 | INFO | __main__ : Current Frame Index within Batch Video: 123/247 +2025-05-12T15:18:29 | INFO | __main__ : Batch-wise Cosine Similarity | 90.94% +2025-05-12T15:18:29 | INFO | __main__ : Cosine Embedding Loss | 0.0906 +2025-05-12T15:18:29 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:18:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:18:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:53 | INFO | __main__ : Step: 87700 +2025-05-12T15:18:53 | INFO | __main__ : Current Frame Index within Batch Video: 223/247 +2025-05-12T15:18:53 | INFO | __main__ : Batch-wise Cosine Similarity | 93.86% +2025-05-12T15:18:53 | INFO | __main__ : Cosine Embedding Loss | 0.0614 +2025-05-12T15:18:53 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:18:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:18:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:18:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 363/4978] eta: 3 days, 4:39:24 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0557 eval_avg_sim: 0.5810 video-cosine_similarity: 0.9443 time: 59.7880 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:19:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:19:17 | INFO | __main__ : Step: 87800 
+2025-05-12T15:19:17 | INFO | __main__ : Current Frame Index within Batch Video: 82/247 +2025-05-12T15:19:17 | INFO | __main__ : Batch-wise Cosine Similarity | 88.34% +2025-05-12T15:19:17 | INFO | __main__ : Cosine Embedding Loss | 0.1166 +2025-05-12T15:19:17 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:19:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:19:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:19:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:19:41 | INFO | __main__ : Step: 87900 +2025-05-12T15:19:41 | INFO | __main__ : Current Frame Index within Batch Video: 182/247 +2025-05-12T15:19:41 | INFO | __main__ : Batch-wise Cosine Similarity | 94.86% +2025-05-12T15:19:41 | INFO | __main__ : Cosine Embedding Loss | 0.0514 +2025-05-12T15:19:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:19:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:19:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:19:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 364/4978] eta: 3 days, 4:37:54 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0413 eval_avg_sim: 0.5810 video-cosine_similarity: 0.9587 time: 59.3196 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:20:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:20:05 | INFO | __main__ : Step: 88000 +2025-05-12T15:20:05 | INFO | __main__ : Current Frame Index within Batch Video: 41/247 +2025-05-12T15:20:05 | INFO | __main__ : Batch-wise Cosine Similarity | 78.25% +2025-05-12T15:20:05 | INFO | __main__ : Cosine Embedding Loss | 0.2175 +2025-05-12T15:20:05 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:20:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:20:05 | INFO | 
__main__ : Evaluation Average Sim | 0.5810 +2025-05-12T15:20:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:20:05 | INFO | __main__ : Performing periodic evaluation at global step 88000... +2025-05-12T15:20:05 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:20:05 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:20:05 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:20:05 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:20:14 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5954 +2025-05-12T15:20:14 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0088000.png +2025-05-12T15:20:14 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:20:14 | INFO | __main__ : Evaluation at step 88000 complete. 
Average Similarity: 0.5954 +2025-05-12T15:20:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:20:38 | INFO | __main__ : Step: 88100 +2025-05-12T15:20:38 | INFO | __main__ : Current Frame Index within Batch Video: 141/247 +2025-05-12T15:20:38 | INFO | __main__ : Batch-wise Cosine Similarity | 88.61% +2025-05-12T15:20:38 | INFO | __main__ : Cosine Embedding Loss | 0.1139 +2025-05-12T15:20:38 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:20:38 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:20:38 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:21:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:21:02 | INFO | __main__ : Step: 88200 +2025-05-12T15:21:02 | INFO | __main__ : Current Frame Index within Batch Video: 241/247 +2025-05-12T15:21:02 | INFO | __main__ : Batch-wise Cosine Similarity | 91.14% +2025-05-12T15:21:02 | INFO | __main__ : Cosine Embedding Loss | 0.0886 +2025-05-12T15:21:02 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:21:02 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:21:02 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:21:03 | INFO | utils.basic_utils : Train Epoch: [0] [ 365/4978] eta: 3 days, 4:38:27 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0786 eval_avg_sim: 0.5954 video-cosine_similarity: 0.9214 time: 59.8048 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:21:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:21:26 | INFO | __main__ : Step: 88300 +2025-05-12T15:21:26 | INFO | __main__ : Current Frame Index within Batch Video: 100/247 +2025-05-12T15:21:26 | INFO | __main__ : Batch-wise Cosine Similarity | 86.74% +2025-05-12T15:21:26 | INFO | 
__main__ : Cosine Embedding Loss | 0.1326 +2025-05-12T15:21:26 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:21:26 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:21:26 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:21:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:21:49 | INFO | __main__ : Step: 88400 +2025-05-12T15:21:49 | INFO | __main__ : Current Frame Index within Batch Video: 200/247 +2025-05-12T15:21:49 | INFO | __main__ : Batch-wise Cosine Similarity | 93.70% +2025-05-12T15:21:49 | INFO | __main__ : Cosine Embedding Loss | 0.0630 +2025-05-12T15:21:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:21:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:21:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:22:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 366/4978] eta: 3 days, 4:36:58 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0449 eval_avg_sim: 0.5954 video-cosine_similarity: 0.9551 time: 59.8064 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:22:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:22:13 | INFO | __main__ : Step: 88500 +2025-05-12T15:22:13 | INFO | __main__ : Current Frame Index within Batch Video: 59/247 +2025-05-12T15:22:13 | INFO | __main__ : Batch-wise Cosine Similarity | 84.30% +2025-05-12T15:22:13 | INFO | __main__ : Cosine Embedding Loss | 0.1570 +2025-05-12T15:22:13 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:22:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:22:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:22:37 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:22:37 | INFO | __main__ : Step: 88600 +2025-05-12T15:22:37 | INFO | __main__ : Current Frame Index within Batch Video: 159/247 +2025-05-12T15:22:37 | INFO | __main__ : Batch-wise Cosine Similarity | 90.42% +2025-05-12T15:22:37 | INFO | __main__ : Cosine Embedding Loss | 0.0958 +2025-05-12T15:22:37 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:22:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:22:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:22:58 | INFO | utils.basic_utils : Train Epoch: [0] [ 367/4978] eta: 3 days, 4:35:29 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0939 eval_avg_sim: 0.5954 video-cosine_similarity: 0.9061 time: 59.8077 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:23:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:23:01 | INFO | __main__ : Step: 88700 +2025-05-12T15:23:01 | INFO | __main__ : Current Frame Index within Batch Video: 18/247 +2025-05-12T15:23:01 | INFO | __main__ : Batch-wise Cosine Similarity | 66.10% +2025-05-12T15:23:01 | INFO | __main__ : Cosine Embedding Loss | 0.3390 +2025-05-12T15:23:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:23:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:23:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:23:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:23:25 | INFO | __main__ : Step: 88800 +2025-05-12T15:23:25 | INFO | __main__ : Current Frame Index within Batch Video: 118/247 +2025-05-12T15:23:25 | INFO | __main__ : Batch-wise Cosine Similarity | 91.34% +2025-05-12T15:23:25 | INFO | __main__ : Cosine Embedding Loss | 0.0866 +2025-05-12T15:23:25 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T15:23:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:23:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:23:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:23:49 | INFO | __main__ : Step: 88900 +2025-05-12T15:23:49 | INFO | __main__ : Current Frame Index within Batch Video: 218/247 +2025-05-12T15:23:49 | INFO | __main__ : Batch-wise Cosine Similarity | 94.59% +2025-05-12T15:23:49 | INFO | __main__ : Cosine Embedding Loss | 0.0541 +2025-05-12T15:23:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:23:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:23:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:23:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 368/4978] eta: 3 days, 4:34:00 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0537 eval_avg_sim: 0.5954 video-cosine_similarity: 0.9463 time: 59.3447 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:24:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:24:13 | INFO | __main__ : Step: 89000 +2025-05-12T15:24:13 | INFO | __main__ : Current Frame Index within Batch Video: 77/247 +2025-05-12T15:24:13 | INFO | __main__ : Batch-wise Cosine Similarity | 85.03% +2025-05-12T15:24:13 | INFO | __main__ : Cosine Embedding Loss | 0.1497 +2025-05-12T15:24:13 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:24:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:24:13 | INFO | __main__ : Evaluation Average Sim | 0.5954 +2025-05-12T15:24:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:24:13 | INFO | __main__ : Performing periodic evaluation at global step 89000... 
+2025-05-12T15:24:13 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:24:13 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:24:13 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:24:13 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:24:22 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5706 +2025-05-12T15:24:22 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0089000.png +2025-05-12T15:24:22 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:24:22 | INFO | __main__ : Evaluation at step 89000 complete. Average Similarity: 0.5706 +2025-05-12T15:24:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:24:46 | INFO | __main__ : Step: 89100 +2025-05-12T15:24:46 | INFO | __main__ : Current Frame Index within Batch Video: 177/247 +2025-05-12T15:24:46 | INFO | __main__ : Batch-wise Cosine Similarity | 91.52% +2025-05-12T15:24:46 | INFO | __main__ : Cosine Embedding Loss | 0.0848 +2025-05-12T15:24:46 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:24:46 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:24:46 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:25:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 369/4978] eta: 3 days, 4:34:28 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0806 eval_avg_sim: 0.5706 video-cosine_similarity: 0.9194 time: 59.8116 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:25:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:25:10 | INFO | __main__ : 
Step: 89200 +2025-05-12T15:25:10 | INFO | __main__ : Current Frame Index within Batch Video: 36/247 +2025-05-12T15:25:10 | INFO | __main__ : Batch-wise Cosine Similarity | 77.34% +2025-05-12T15:25:10 | INFO | __main__ : Cosine Embedding Loss | 0.2266 +2025-05-12T15:25:10 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:25:10 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:25:10 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:25:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:25:34 | INFO | __main__ : Step: 89300 +2025-05-12T15:25:34 | INFO | __main__ : Current Frame Index within Batch Video: 136/247 +2025-05-12T15:25:34 | INFO | __main__ : Batch-wise Cosine Similarity | 92.48% +2025-05-12T15:25:34 | INFO | __main__ : Cosine Embedding Loss | 0.0752 +2025-05-12T15:25:34 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:25:34 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:25:34 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:25:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:25:57 | INFO | __main__ : Step: 89400 +2025-05-12T15:25:57 | INFO | __main__ : Current Frame Index within Batch Video: 236/247 +2025-05-12T15:25:57 | INFO | __main__ : Batch-wise Cosine Similarity | 94.58% +2025-05-12T15:25:57 | INFO | __main__ : Cosine Embedding Loss | 0.0542 +2025-05-12T15:25:57 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:25:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:25:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:26:00 | INFO | utils.basic_utils : Train Epoch: [0] [ 370/4978] eta: 3 days, 4:32:59 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0978 
eval_avg_sim: 0.5706 video-cosine_similarity: 0.9022 time: 59.8180 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:26:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:26:21 | INFO | __main__ : Step: 89500 +2025-05-12T15:26:21 | INFO | __main__ : Current Frame Index within Batch Video: 95/247 +2025-05-12T15:26:21 | INFO | __main__ : Batch-wise Cosine Similarity | 87.16% +2025-05-12T15:26:21 | INFO | __main__ : Cosine Embedding Loss | 0.1284 +2025-05-12T15:26:21 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:26:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:26:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:26:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:26:45 | INFO | __main__ : Step: 89600 +2025-05-12T15:26:45 | INFO | __main__ : Current Frame Index within Batch Video: 195/247 +2025-05-12T15:26:45 | INFO | __main__ : Batch-wise Cosine Similarity | 92.82% +2025-05-12T15:26:45 | INFO | __main__ : Cosine Embedding Loss | 0.0718 +2025-05-12T15:26:45 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:26:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:26:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:26:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 371/4978] eta: 3 days, 4:31:31 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0710 eval_avg_sim: 0.5706 video-cosine_similarity: 0.9290 time: 59.8185 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:27:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:27:09 | INFO | __main__ : Step: 89700 +2025-05-12T15:27:09 | INFO | __main__ : Current Frame Index within Batch Video: 54/247 +2025-05-12T15:27:09 | INFO 
| __main__ : Batch-wise Cosine Similarity | 82.09% +2025-05-12T15:27:09 | INFO | __main__ : Cosine Embedding Loss | 0.1791 +2025-05-12T15:27:09 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:27:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:27:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:27:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:27:33 | INFO | __main__ : Step: 89800 +2025-05-12T15:27:33 | INFO | __main__ : Current Frame Index within Batch Video: 154/247 +2025-05-12T15:27:33 | INFO | __main__ : Batch-wise Cosine Similarity | 91.94% +2025-05-12T15:27:33 | INFO | __main__ : Cosine Embedding Loss | 0.0806 +2025-05-12T15:27:33 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:27:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:27:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:27:55 | INFO | utils.basic_utils : Train Epoch: [0] [ 372/4978] eta: 3 days, 4:30:02 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0665 eval_avg_sim: 0.5706 video-cosine_similarity: 0.9335 time: 59.3505 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:27:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:27:57 | INFO | __main__ : Step: 89900 +2025-05-12T15:27:57 | INFO | __main__ : Current Frame Index within Batch Video: 13/247 +2025-05-12T15:27:57 | INFO | __main__ : Batch-wise Cosine Similarity | 62.20% +2025-05-12T15:27:57 | INFO | __main__ : Cosine Embedding Loss | 0.3780 +2025-05-12T15:27:57 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:27:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:27:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:28:21 | INFO 
| __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:28:21 | INFO | __main__ : Step: 90000 +2025-05-12T15:28:21 | INFO | __main__ : Current Frame Index within Batch Video: 113/247 +2025-05-12T15:28:21 | INFO | __main__ : Batch-wise Cosine Similarity | 87.12% +2025-05-12T15:28:21 | INFO | __main__ : Cosine Embedding Loss | 0.1288 +2025-05-12T15:28:21 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:28:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:28:21 | INFO | __main__ : Evaluation Average Sim | 0.5706 +2025-05-12T15:28:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:28:21 | INFO | __main__ : Saving checkpoint at global step 90000 +2025-05-12T15:28:21 | INFO | __main__ : Performing periodic evaluation at global step 90000... +2025-05-12T15:28:21 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:28:21 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:28:21 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:28:21 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:28:30 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6442 +2025-05-12T15:28:30 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0090000.png +2025-05-12T15:28:30 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:28:30 | INFO | __main__ : Evaluation at step 90000 complete. 
Average Similarity: 0.6442 +2025-05-12T15:28:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:28:54 | INFO | __main__ : Step: 90100 +2025-05-12T15:28:54 | INFO | __main__ : Current Frame Index within Batch Video: 213/247 +2025-05-12T15:28:54 | INFO | __main__ : Batch-wise Cosine Similarity | 92.56% +2025-05-12T15:28:54 | INFO | __main__ : Cosine Embedding Loss | 0.0744 +2025-05-12T15:28:54 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:28:54 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:28:54 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:29:02 | INFO | utils.basic_utils : Train Epoch: [0] [ 373/4978] eta: 3 days, 4:30:30 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0681 eval_avg_sim: 0.6442 video-cosine_similarity: 0.9319 time: 59.8232 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:29:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:29:18 | INFO | __main__ : Step: 90200 +2025-05-12T15:29:18 | INFO | __main__ : Current Frame Index within Batch Video: 72/247 +2025-05-12T15:29:18 | INFO | __main__ : Batch-wise Cosine Similarity | 86.27% +2025-05-12T15:29:18 | INFO | __main__ : Cosine Embedding Loss | 0.1373 +2025-05-12T15:29:18 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:29:18 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:29:18 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:29:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:29:41 | INFO | __main__ : Step: 90300 +2025-05-12T15:29:41 | INFO | __main__ : Current Frame Index within Batch Video: 172/247 +2025-05-12T15:29:41 | INFO | __main__ : Batch-wise Cosine Similarity | 91.45% +2025-05-12T15:29:41 | INFO | 
__main__ : Cosine Embedding Loss | 0.0855 +2025-05-12T15:29:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:29:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:29:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:29:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 374/4978] eta: 3 days, 4:29:01 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0653 eval_avg_sim: 0.6442 video-cosine_similarity: 0.9347 time: 59.8214 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:30:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:30:05 | INFO | __main__ : Step: 90400 +2025-05-12T15:30:05 | INFO | __main__ : Current Frame Index within Batch Video: 31/247 +2025-05-12T15:30:05 | INFO | __main__ : Batch-wise Cosine Similarity | 76.61% +2025-05-12T15:30:05 | INFO | __main__ : Cosine Embedding Loss | 0.2339 +2025-05-12T15:30:05 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:30:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:30:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:30:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:30:29 | INFO | __main__ : Step: 90500 +2025-05-12T15:30:29 | INFO | __main__ : Current Frame Index within Batch Video: 131/247 +2025-05-12T15:30:29 | INFO | __main__ : Batch-wise Cosine Similarity | 90.74% +2025-05-12T15:30:29 | INFO | __main__ : Cosine Embedding Loss | 0.0926 +2025-05-12T15:30:29 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:30:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:30:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:30:53 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:30:53 | INFO | __main__ : Step: 90600 +2025-05-12T15:30:53 | INFO | __main__ : Current Frame Index within Batch Video: 231/247 +2025-05-12T15:30:53 | INFO | __main__ : Batch-wise Cosine Similarity | 95.12% +2025-05-12T15:30:53 | INFO | __main__ : Cosine Embedding Loss | 0.0488 +2025-05-12T15:30:53 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:30:53 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:30:53 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:30:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 375/4978] eta: 3 days, 4:27:33 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0470 eval_avg_sim: 0.6442 video-cosine_similarity: 0.9530 time: 59.8216 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:31:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:31:17 | INFO | __main__ : Step: 90700 +2025-05-12T15:31:17 | INFO | __main__ : Current Frame Index within Batch Video: 90/247 +2025-05-12T15:31:17 | INFO | __main__ : Batch-wise Cosine Similarity | 86.20% +2025-05-12T15:31:17 | INFO | __main__ : Cosine Embedding Loss | 0.1380 +2025-05-12T15:31:17 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:31:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:31:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:31:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:31:41 | INFO | __main__ : Step: 90800 +2025-05-12T15:31:41 | INFO | __main__ : Current Frame Index within Batch Video: 190/247 +2025-05-12T15:31:41 | INFO | __main__ : Batch-wise Cosine Similarity | 93.94% +2025-05-12T15:31:41 | INFO | __main__ : Cosine Embedding Loss | 0.0606 +2025-05-12T15:31:41 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T15:31:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:31:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:31:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 376/4978] eta: 3 days, 4:26:04 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0843 eval_avg_sim: 0.6442 video-cosine_similarity: 0.9157 time: 59.3483 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:32:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:32:05 | INFO | __main__ : Step: 90900 +2025-05-12T15:32:05 | INFO | __main__ : Current Frame Index within Batch Video: 49/247 +2025-05-12T15:32:05 | INFO | __main__ : Batch-wise Cosine Similarity | 78.47% +2025-05-12T15:32:05 | INFO | __main__ : Cosine Embedding Loss | 0.2153 +2025-05-12T15:32:05 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:32:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:32:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:32:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:32:28 | INFO | __main__ : Step: 91000 +2025-05-12T15:32:28 | INFO | __main__ : Current Frame Index within Batch Video: 149/247 +2025-05-12T15:32:28 | INFO | __main__ : Batch-wise Cosine Similarity | 91.31% +2025-05-12T15:32:28 | INFO | __main__ : Cosine Embedding Loss | 0.0869 +2025-05-12T15:32:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:32:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:32:28 | INFO | __main__ : Evaluation Average Sim | 0.6442 +2025-05-12T15:32:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:32:29 | INFO | __main__ : Performing periodic evaluation at global step 91000... 
+2025-05-12T15:32:29 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:32:29 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:32:29 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:32:29 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:32:38 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5626 +2025-05-12T15:32:38 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0091000.png +2025-05-12T15:32:38 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:32:38 | INFO | __main__ : Evaluation at step 91000 complete. Average Similarity: 0.5626 +2025-05-12T15:33:01 | INFO | utils.basic_utils : Train Epoch: [0] [ 377/4978] eta: 3 days, 4:26:29 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0450 eval_avg_sim: 0.5626 video-cosine_similarity: 0.9550 time: 59.8164 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:33:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:33:01 | INFO | __main__ : Step: 91100 +2025-05-12T15:33:01 | INFO | __main__ : Current Frame Index within Batch Video: 8/247 +2025-05-12T15:33:01 | INFO | __main__ : Batch-wise Cosine Similarity | 60.08% +2025-05-12T15:33:01 | INFO | __main__ : Cosine Embedding Loss | 0.3992 +2025-05-12T15:33:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:33:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:33:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:33:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:33:25 | INFO | __main__ : 
Step: 91200 +2025-05-12T15:33:25 | INFO | __main__ : Current Frame Index within Batch Video: 108/247 +2025-05-12T15:33:25 | INFO | __main__ : Batch-wise Cosine Similarity | 91.56% +2025-05-12T15:33:25 | INFO | __main__ : Cosine Embedding Loss | 0.0844 +2025-05-12T15:33:25 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:33:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:33:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:33:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:33:49 | INFO | __main__ : Step: 91300 +2025-05-12T15:33:49 | INFO | __main__ : Current Frame Index within Batch Video: 208/247 +2025-05-12T15:33:49 | INFO | __main__ : Batch-wise Cosine Similarity | 91.60% +2025-05-12T15:33:49 | INFO | __main__ : Cosine Embedding Loss | 0.0840 +2025-05-12T15:33:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:33:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:33:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:33:58 | INFO | utils.basic_utils : Train Epoch: [0] [ 378/4978] eta: 3 days, 4:25:01 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0691 eval_avg_sim: 0.5626 video-cosine_similarity: 0.9309 time: 59.8169 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:34:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:34:13 | INFO | __main__ : Step: 91400 +2025-05-12T15:34:13 | INFO | __main__ : Current Frame Index within Batch Video: 67/247 +2025-05-12T15:34:13 | INFO | __main__ : Batch-wise Cosine Similarity | 82.79% +2025-05-12T15:34:13 | INFO | __main__ : Cosine Embedding Loss | 0.1721 +2025-05-12T15:34:13 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:34:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:34:13 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:34:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:34:37 | INFO | __main__ : Step: 91500 +2025-05-12T15:34:37 | INFO | __main__ : Current Frame Index within Batch Video: 167/247 +2025-05-12T15:34:37 | INFO | __main__ : Batch-wise Cosine Similarity | 88.29% +2025-05-12T15:34:37 | INFO | __main__ : Cosine Embedding Loss | 0.1171 +2025-05-12T15:34:37 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:34:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:34:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:34:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 379/4978] eta: 3 days, 4:23:32 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0899 eval_avg_sim: 0.5626 video-cosine_similarity: 0.9101 time: 59.8180 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:35:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:35:01 | INFO | __main__ : Step: 91600 +2025-05-12T15:35:01 | INFO | __main__ : Current Frame Index within Batch Video: 26/247 +2025-05-12T15:35:01 | INFO | __main__ : Batch-wise Cosine Similarity | 72.96% +2025-05-12T15:35:01 | INFO | __main__ : Cosine Embedding Loss | 0.2704 +2025-05-12T15:35:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:35:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:35:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:35:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:35:25 | INFO | __main__ : Step: 91700 +2025-05-12T15:35:25 | INFO | __main__ : Current Frame Index within Batch Video: 126/247 +2025-05-12T15:35:25 | INFO | 
__main__ : Batch-wise Cosine Similarity | 90.26% +2025-05-12T15:35:25 | INFO | __main__ : Cosine Embedding Loss | 0.0974 +2025-05-12T15:35:25 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:35:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:35:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:35:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:35:48 | INFO | __main__ : Step: 91800 +2025-05-12T15:35:48 | INFO | __main__ : Current Frame Index within Batch Video: 226/247 +2025-05-12T15:35:48 | INFO | __main__ : Batch-wise Cosine Similarity | 90.99% +2025-05-12T15:35:48 | INFO | __main__ : Cosine Embedding Loss | 0.0901 +2025-05-12T15:35:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:35:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:35:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:35:53 | INFO | utils.basic_utils : Train Epoch: [0] [ 380/4978] eta: 3 days, 4:22:05 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0803 eval_avg_sim: 0.5626 video-cosine_similarity: 0.9197 time: 59.3555 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:36:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:36:12 | INFO | __main__ : Step: 91900 +2025-05-12T15:36:12 | INFO | __main__ : Current Frame Index within Batch Video: 85/247 +2025-05-12T15:36:12 | INFO | __main__ : Batch-wise Cosine Similarity | 84.44% +2025-05-12T15:36:12 | INFO | __main__ : Cosine Embedding Loss | 0.1556 +2025-05-12T15:36:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:36:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:36:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:36:36 | INFO | 
__main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:36:36 | INFO | __main__ : Step: 92000 +2025-05-12T15:36:36 | INFO | __main__ : Current Frame Index within Batch Video: 185/247 +2025-05-12T15:36:36 | INFO | __main__ : Batch-wise Cosine Similarity | 90.75% +2025-05-12T15:36:36 | INFO | __main__ : Cosine Embedding Loss | 0.0925 +2025-05-12T15:36:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:36:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:36:36 | INFO | __main__ : Evaluation Average Sim | 0.5626 +2025-05-12T15:36:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:36:36 | INFO | __main__ : Performing periodic evaluation at global step 92000... +2025-05-12T15:36:36 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:36:36 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:36:36 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:36:36 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:36:46 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5374 +2025-05-12T15:36:46 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0092000.png +2025-05-12T15:36:46 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:36:46 | INFO | __main__ : Evaluation at step 92000 complete. 
Average Similarity: 0.5374 +2025-05-12T15:37:00 | INFO | utils.basic_utils : Train Epoch: [0] [ 381/4978] eta: 3 days, 4:22:30 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0723 eval_avg_sim: 0.5374 video-cosine_similarity: 0.9277 time: 59.8247 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:37:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:37:09 | INFO | __main__ : Step: 92100 +2025-05-12T15:37:09 | INFO | __main__ : Current Frame Index within Batch Video: 44/247 +2025-05-12T15:37:09 | INFO | __main__ : Batch-wise Cosine Similarity | 79.11% +2025-05-12T15:37:09 | INFO | __main__ : Cosine Embedding Loss | 0.2089 +2025-05-12T15:37:09 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:37:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:37:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:37:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:37:33 | INFO | __main__ : Step: 92200 +2025-05-12T15:37:33 | INFO | __main__ : Current Frame Index within Batch Video: 144/247 +2025-05-12T15:37:33 | INFO | __main__ : Batch-wise Cosine Similarity | 92.44% +2025-05-12T15:37:33 | INFO | __main__ : Cosine Embedding Loss | 0.0756 +2025-05-12T15:37:33 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:37:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:37:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:37:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:37:57 | INFO | __main__ : Step: 92300 +2025-05-12T15:37:57 | INFO | __main__ : Current Frame Index within Batch Video: 244/247 +2025-05-12T15:37:57 | INFO | __main__ : Batch-wise Cosine Similarity | 91.60% +2025-05-12T15:37:57 | INFO | 
__main__ : Cosine Embedding Loss | 0.0840 +2025-05-12T15:37:57 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:37:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:37:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:37:58 | INFO | utils.basic_utils : Train Epoch: [0] [ 382/4978] eta: 3 days, 4:21:02 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0855 eval_avg_sim: 0.5374 video-cosine_similarity: 0.9145 time: 59.8240 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:38:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:38:21 | INFO | __main__ : Step: 92400 +2025-05-12T15:38:21 | INFO | __main__ : Current Frame Index within Batch Video: 103/247 +2025-05-12T15:38:21 | INFO | __main__ : Batch-wise Cosine Similarity | 88.73% +2025-05-12T15:38:21 | INFO | __main__ : Cosine Embedding Loss | 0.1127 +2025-05-12T15:38:21 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:38:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:38:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:38:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:38:45 | INFO | __main__ : Step: 92500 +2025-05-12T15:38:45 | INFO | __main__ : Current Frame Index within Batch Video: 203/247 +2025-05-12T15:38:45 | INFO | __main__ : Batch-wise Cosine Similarity | 92.33% +2025-05-12T15:38:45 | INFO | __main__ : Cosine Embedding Loss | 0.0767 +2025-05-12T15:38:45 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:38:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:38:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:38:55 | INFO | utils.basic_utils : Train Epoch: [0] [ 383/4978] eta: 3 days, 4:19:35 lr: 
0.000010 temperature: 0.0126 video-loss_cosine: 0.0588 eval_avg_sim: 0.5374 video-cosine_similarity: 0.9412 time: 59.8267 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:39:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:39:09 | INFO | __main__ : Step: 92600 +2025-05-12T15:39:09 | INFO | __main__ : Current Frame Index within Batch Video: 62/247 +2025-05-12T15:39:09 | INFO | __main__ : Batch-wise Cosine Similarity | 83.06% +2025-05-12T15:39:09 | INFO | __main__ : Cosine Embedding Loss | 0.1694 +2025-05-12T15:39:09 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:39:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:39:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:39:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:39:32 | INFO | __main__ : Step: 92700 +2025-05-12T15:39:32 | INFO | __main__ : Current Frame Index within Batch Video: 162/247 +2025-05-12T15:39:32 | INFO | __main__ : Batch-wise Cosine Similarity | 91.14% +2025-05-12T15:39:32 | INFO | __main__ : Cosine Embedding Loss | 0.0886 +2025-05-12T15:39:32 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:39:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:39:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:39:53 | INFO | utils.basic_utils : Train Epoch: [0] [ 384/4978] eta: 3 days, 4:18:07 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0600 eval_avg_sim: 0.5374 video-cosine_similarity: 0.9400 time: 59.8274 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:39:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:39:56 | INFO | __main__ : Step: 92800 +2025-05-12T15:39:56 | INFO | __main__ : Current Frame Index 
within Batch Video: 21/247 +2025-05-12T15:39:56 | INFO | __main__ : Batch-wise Cosine Similarity | 67.19% +2025-05-12T15:39:56 | INFO | __main__ : Cosine Embedding Loss | 0.3281 +2025-05-12T15:39:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:39:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:39:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:40:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:40:20 | INFO | __main__ : Step: 92900 +2025-05-12T15:40:20 | INFO | __main__ : Current Frame Index within Batch Video: 121/247 +2025-05-12T15:40:20 | INFO | __main__ : Batch-wise Cosine Similarity | 88.54% +2025-05-12T15:40:20 | INFO | __main__ : Cosine Embedding Loss | 0.1146 +2025-05-12T15:40:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:40:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:40:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:40:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:40:44 | INFO | __main__ : Step: 93000 +2025-05-12T15:40:44 | INFO | __main__ : Current Frame Index within Batch Video: 221/247 +2025-05-12T15:40:44 | INFO | __main__ : Batch-wise Cosine Similarity | 92.97% +2025-05-12T15:40:44 | INFO | __main__ : Cosine Embedding Loss | 0.0703 +2025-05-12T15:40:44 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:40:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:40:44 | INFO | __main__ : Evaluation Average Sim | 0.5374 +2025-05-12T15:40:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:40:44 | INFO | __main__ : Performing periodic evaluation at global step 93000... 
+2025-05-12T15:40:44 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:40:44 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:40:44 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:40:44 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:40:53 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6358 +2025-05-12T15:40:54 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0093000.png +2025-05-12T15:40:54 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:40:54 | INFO | __main__ : Evaluation at step 93000 complete. Average Similarity: 0.6358 +2025-05-12T15:40:59 | INFO | utils.basic_utils : Train Epoch: [0] [ 385/4978] eta: 3 days, 4:18:31 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0613 eval_avg_sim: 0.6358 video-cosine_similarity: 0.9387 time: 59.8113 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:41:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:41:17 | INFO | __main__ : Step: 93100 +2025-05-12T15:41:17 | INFO | __main__ : Current Frame Index within Batch Video: 80/247 +2025-05-12T15:41:17 | INFO | __main__ : Batch-wise Cosine Similarity | 86.11% +2025-05-12T15:41:17 | INFO | __main__ : Cosine Embedding Loss | 0.1389 +2025-05-12T15:41:17 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:41:17 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:41:17 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:41:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:41:41 | INFO | __main__ : 
Step: 93200 +2025-05-12T15:41:41 | INFO | __main__ : Current Frame Index within Batch Video: 180/247 +2025-05-12T15:41:41 | INFO | __main__ : Batch-wise Cosine Similarity | 93.52% +2025-05-12T15:41:41 | INFO | __main__ : Cosine Embedding Loss | 0.0648 +2025-05-12T15:41:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:41:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:41:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:41:57 | INFO | utils.basic_utils : Train Epoch: [0] [ 386/4978] eta: 3 days, 4:17:02 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0467 eval_avg_sim: 0.6358 video-cosine_similarity: 0.9533 time: 59.8064 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:42:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:42:05 | INFO | __main__ : Step: 93300 +2025-05-12T15:42:05 | INFO | __main__ : Current Frame Index within Batch Video: 39/247 +2025-05-12T15:42:05 | INFO | __main__ : Batch-wise Cosine Similarity | 76.00% +2025-05-12T15:42:05 | INFO | __main__ : Cosine Embedding Loss | 0.2400 +2025-05-12T15:42:05 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:42:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:42:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:42:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:42:29 | INFO | __main__ : Step: 93400 +2025-05-12T15:42:29 | INFO | __main__ : Current Frame Index within Batch Video: 139/247 +2025-05-12T15:42:29 | INFO | __main__ : Batch-wise Cosine Similarity | 91.46% +2025-05-12T15:42:29 | INFO | __main__ : Cosine Embedding Loss | 0.0854 +2025-05-12T15:42:29 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:42:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:42:29 
| INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:42:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:42:52 | INFO | __main__ : Step: 93500 +2025-05-12T15:42:52 | INFO | __main__ : Current Frame Index within Batch Video: 239/247 +2025-05-12T15:42:52 | INFO | __main__ : Batch-wise Cosine Similarity | 94.93% +2025-05-12T15:42:52 | INFO | __main__ : Cosine Embedding Loss | 0.0507 +2025-05-12T15:42:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:42:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:42:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:42:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 387/4978] eta: 3 days, 4:15:35 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0463 eval_avg_sim: 0.6358 video-cosine_similarity: 0.9537 time: 59.8059 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:43:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:43:16 | INFO | __main__ : Step: 93600 +2025-05-12T15:43:16 | INFO | __main__ : Current Frame Index within Batch Video: 98/247 +2025-05-12T15:43:16 | INFO | __main__ : Batch-wise Cosine Similarity | 88.08% +2025-05-12T15:43:16 | INFO | __main__ : Cosine Embedding Loss | 0.1192 +2025-05-12T15:43:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:43:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:43:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:43:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:43:40 | INFO | __main__ : Step: 93700 +2025-05-12T15:43:40 | INFO | __main__ : Current Frame Index within Batch Video: 198/247 +2025-05-12T15:43:40 | INFO | 
__main__ : Batch-wise Cosine Similarity | 92.62% +2025-05-12T15:43:40 | INFO | __main__ : Cosine Embedding Loss | 0.0738 +2025-05-12T15:43:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:43:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:43:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:43:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 388/4978] eta: 3 days, 4:14:07 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0771 eval_avg_sim: 0.6358 video-cosine_similarity: 0.9229 time: 59.8030 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:44:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:44:04 | INFO | __main__ : Step: 93800 +2025-05-12T15:44:04 | INFO | __main__ : Current Frame Index within Batch Video: 57/247 +2025-05-12T15:44:04 | INFO | __main__ : Batch-wise Cosine Similarity | 83.44% +2025-05-12T15:44:04 | INFO | __main__ : Cosine Embedding Loss | 0.1656 +2025-05-12T15:44:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:44:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:44:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:44:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:44:28 | INFO | __main__ : Step: 93900 +2025-05-12T15:44:28 | INFO | __main__ : Current Frame Index within Batch Video: 157/247 +2025-05-12T15:44:28 | INFO | __main__ : Batch-wise Cosine Similarity | 90.99% +2025-05-12T15:44:28 | INFO | __main__ : Cosine Embedding Loss | 0.0901 +2025-05-12T15:44:28 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:44:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:44:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:44:49 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 389/4978] eta: 3 days, 4:12:39 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0700 eval_avg_sim: 0.6358 video-cosine_similarity: 0.9300 time: 59.3330 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:44:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:44:52 | INFO | __main__ : Step: 94000 +2025-05-12T15:44:52 | INFO | __main__ : Current Frame Index within Batch Video: 16/247 +2025-05-12T15:44:52 | INFO | __main__ : Batch-wise Cosine Similarity | 63.47% +2025-05-12T15:44:52 | INFO | __main__ : Cosine Embedding Loss | 0.3653 +2025-05-12T15:44:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:44:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:44:52 | INFO | __main__ : Evaluation Average Sim | 0.6358 +2025-05-12T15:44:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:44:52 | INFO | __main__ : Performing periodic evaluation at global step 94000... +2025-05-12T15:44:52 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:44:52 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:44:52 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:44:52 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:45:02 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5491 +2025-05-12T15:45:02 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0094000.png +2025-05-12T15:45:02 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:45:02 | INFO | __main__ : Evaluation at step 94000 complete. 
Average Similarity: 0.5491 +2025-05-12T15:45:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:45:25 | INFO | __main__ : Step: 94100 +2025-05-12T15:45:25 | INFO | __main__ : Current Frame Index within Batch Video: 116/247 +2025-05-12T15:45:25 | INFO | __main__ : Batch-wise Cosine Similarity | 87.91% +2025-05-12T15:45:25 | INFO | __main__ : Cosine Embedding Loss | 0.1209 +2025-05-12T15:45:25 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:45:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:45:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:45:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:45:49 | INFO | __main__ : Step: 94200 +2025-05-12T15:45:49 | INFO | __main__ : Current Frame Index within Batch Video: 216/247 +2025-05-12T15:45:49 | INFO | __main__ : Batch-wise Cosine Similarity | 89.61% +2025-05-12T15:45:49 | INFO | __main__ : Cosine Embedding Loss | 0.1039 +2025-05-12T15:45:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:45:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:45:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:45:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 390/4978] eta: 3 days, 4:13:08 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0818 eval_avg_sim: 0.5491 video-cosine_similarity: 0.9182 time: 59.8269 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:46:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:46:13 | INFO | __main__ : Step: 94300 +2025-05-12T15:46:13 | INFO | __main__ : Current Frame Index within Batch Video: 75/247 +2025-05-12T15:46:13 | INFO | __main__ : Batch-wise Cosine Similarity | 85.90% +2025-05-12T15:46:13 | INFO | 
__main__ : Cosine Embedding Loss | 0.1410 +2025-05-12T15:46:13 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:46:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:46:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:46:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:46:37 | INFO | __main__ : Step: 94400 +2025-05-12T15:46:37 | INFO | __main__ : Current Frame Index within Batch Video: 175/247 +2025-05-12T15:46:37 | INFO | __main__ : Batch-wise Cosine Similarity | 90.22% +2025-05-12T15:46:37 | INFO | __main__ : Cosine Embedding Loss | 0.0978 +2025-05-12T15:46:37 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:46:37 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:46:37 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:46:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 391/4978] eta: 3 days, 4:11:42 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0386 eval_avg_sim: 0.5491 video-cosine_similarity: 0.9614 time: 59.8277 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:47:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:47:01 | INFO | __main__ : Step: 94500 +2025-05-12T15:47:01 | INFO | __main__ : Current Frame Index within Batch Video: 34/247 +2025-05-12T15:47:01 | INFO | __main__ : Batch-wise Cosine Similarity | 78.64% +2025-05-12T15:47:01 | INFO | __main__ : Cosine Embedding Loss | 0.2136 +2025-05-12T15:47:01 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:47:01 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:47:01 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:47:25 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:47:25 | INFO | __main__ : Step: 94600 +2025-05-12T15:47:25 | INFO | __main__ : Current Frame Index within Batch Video: 134/247 +2025-05-12T15:47:25 | INFO | __main__ : Batch-wise Cosine Similarity | 90.75% +2025-05-12T15:47:25 | INFO | __main__ : Cosine Embedding Loss | 0.0925 +2025-05-12T15:47:25 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:47:25 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:47:25 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:47:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:47:48 | INFO | __main__ : Step: 94700 +2025-05-12T15:47:48 | INFO | __main__ : Current Frame Index within Batch Video: 234/247 +2025-05-12T15:47:48 | INFO | __main__ : Batch-wise Cosine Similarity | 94.19% +2025-05-12T15:47:48 | INFO | __main__ : Cosine Embedding Loss | 0.0581 +2025-05-12T15:47:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:47:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:47:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:47:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 392/4978] eta: 3 days, 4:10:15 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0568 eval_avg_sim: 0.5491 video-cosine_similarity: 0.9432 time: 59.8283 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:48:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:48:12 | INFO | __main__ : Step: 94800 +2025-05-12T15:48:12 | INFO | __main__ : Current Frame Index within Batch Video: 93/247 +2025-05-12T15:48:12 | INFO | __main__ : Batch-wise Cosine Similarity | 87.81% +2025-05-12T15:48:12 | INFO | __main__ : Cosine Embedding Loss | 0.1219 +2025-05-12T15:48:12 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T15:48:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:48:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:48:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:48:36 | INFO | __main__ : Step: 94900 +2025-05-12T15:48:36 | INFO | __main__ : Current Frame Index within Batch Video: 193/247 +2025-05-12T15:48:36 | INFO | __main__ : Batch-wise Cosine Similarity | 92.58% +2025-05-12T15:48:36 | INFO | __main__ : Cosine Embedding Loss | 0.0742 +2025-05-12T15:48:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:48:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:48:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:48:49 | INFO | utils.basic_utils : Train Epoch: [0] [ 393/4978] eta: 3 days, 4:08:48 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0650 eval_avg_sim: 0.5491 video-cosine_similarity: 0.9350 time: 59.3547 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:49:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:49:00 | INFO | __main__ : Step: 95000 +2025-05-12T15:49:00 | INFO | __main__ : Current Frame Index within Batch Video: 52/247 +2025-05-12T15:49:00 | INFO | __main__ : Batch-wise Cosine Similarity | 82.65% +2025-05-12T15:49:00 | INFO | __main__ : Cosine Embedding Loss | 0.1735 +2025-05-12T15:49:00 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:49:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:49:00 | INFO | __main__ : Evaluation Average Sim | 0.5491 +2025-05-12T15:49:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:49:00 | INFO | __main__ : Saving checkpoint at global step 95000 +2025-05-12T15:49:00 | 
INFO | __main__ : Performing periodic evaluation at global step 95000... +2025-05-12T15:49:00 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:49:01 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:49:01 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:49:01 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:49:10 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.6249 +2025-05-12T15:49:10 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0095000.png +2025-05-12T15:49:10 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:49:10 | INFO | __main__ : Evaluation at step 95000 complete. Average Similarity: 0.6249 +2025-05-12T15:49:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:49:33 | INFO | __main__ : Step: 95100 +2025-05-12T15:49:33 | INFO | __main__ : Current Frame Index within Batch Video: 152/247 +2025-05-12T15:49:33 | INFO | __main__ : Batch-wise Cosine Similarity | 93.12% +2025-05-12T15:49:33 | INFO | __main__ : Cosine Embedding Loss | 0.0688 +2025-05-12T15:49:33 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:49:33 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:49:33 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:49:56 | INFO | utils.basic_utils : Train Epoch: [0] [ 394/4978] eta: 3 days, 4:09:11 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0586 eval_avg_sim: 0.6249 video-cosine_similarity: 0.9414 time: 59.8314 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:49:57 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:49:57 | INFO | __main__ : Step: 95200 +2025-05-12T15:49:57 | INFO | __main__ : Current Frame Index within Batch Video: 11/247 +2025-05-12T15:49:57 | INFO | __main__ : Batch-wise Cosine Similarity | 58.48% +2025-05-12T15:49:57 | INFO | __main__ : Cosine Embedding Loss | 0.4152 +2025-05-12T15:49:57 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:49:57 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:49:57 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:50:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:50:21 | INFO | __main__ : Step: 95300 +2025-05-12T15:50:21 | INFO | __main__ : Current Frame Index within Batch Video: 111/247 +2025-05-12T15:50:21 | INFO | __main__ : Batch-wise Cosine Similarity | 85.93% +2025-05-12T15:50:21 | INFO | __main__ : Cosine Embedding Loss | 0.1407 +2025-05-12T15:50:21 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:50:21 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:50:21 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:50:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:50:45 | INFO | __main__ : Step: 95400 +2025-05-12T15:50:45 | INFO | __main__ : Current Frame Index within Batch Video: 211/247 +2025-05-12T15:50:45 | INFO | __main__ : Batch-wise Cosine Similarity | 91.48% +2025-05-12T15:50:45 | INFO | __main__ : Cosine Embedding Loss | 0.0852 +2025-05-12T15:50:45 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:50:45 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:50:45 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:50:53 | INFO | 
utils.basic_utils : Train Epoch: [0] [ 395/4978] eta: 3 days, 4:07:44 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0649 eval_avg_sim: 0.6249 video-cosine_similarity: 0.9351 time: 59.8296 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:51:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:51:09 | INFO | __main__ : Step: 95500 +2025-05-12T15:51:09 | INFO | __main__ : Current Frame Index within Batch Video: 70/247 +2025-05-12T15:51:09 | INFO | __main__ : Batch-wise Cosine Similarity | 80.39% +2025-05-12T15:51:09 | INFO | __main__ : Cosine Embedding Loss | 0.1961 +2025-05-12T15:51:09 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:51:09 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:51:09 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:51:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:51:32 | INFO | __main__ : Step: 95600 +2025-05-12T15:51:32 | INFO | __main__ : Current Frame Index within Batch Video: 170/247 +2025-05-12T15:51:32 | INFO | __main__ : Batch-wise Cosine Similarity | 90.74% +2025-05-12T15:51:32 | INFO | __main__ : Cosine Embedding Loss | 0.0926 +2025-05-12T15:51:32 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:51:32 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:51:32 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:51:51 | INFO | utils.basic_utils : Train Epoch: [0] [ 396/4978] eta: 3 days, 4:06:16 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0666 eval_avg_sim: 0.6249 video-cosine_similarity: 0.9334 time: 59.8256 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:51:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:51:56 | INFO | __main__ : 
Step: 95700 +2025-05-12T15:51:56 | INFO | __main__ : Current Frame Index within Batch Video: 29/247 +2025-05-12T15:51:56 | INFO | __main__ : Batch-wise Cosine Similarity | 73.05% +2025-05-12T15:51:56 | INFO | __main__ : Cosine Embedding Loss | 0.2695 +2025-05-12T15:51:56 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:51:56 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:51:56 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:52:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:52:20 | INFO | __main__ : Step: 95800 +2025-05-12T15:52:20 | INFO | __main__ : Current Frame Index within Batch Video: 129/247 +2025-05-12T15:52:20 | INFO | __main__ : Batch-wise Cosine Similarity | 90.37% +2025-05-12T15:52:20 | INFO | __main__ : Cosine Embedding Loss | 0.0963 +2025-05-12T15:52:20 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:52:20 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:52:20 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:52:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:52:44 | INFO | __main__ : Step: 95900 +2025-05-12T15:52:44 | INFO | __main__ : Current Frame Index within Batch Video: 229/247 +2025-05-12T15:52:44 | INFO | __main__ : Batch-wise Cosine Similarity | 93.88% +2025-05-12T15:52:44 | INFO | __main__ : Cosine Embedding Loss | 0.0612 +2025-05-12T15:52:44 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:52:44 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:52:44 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:52:48 | INFO | utils.basic_utils : Train Epoch: [0] [ 397/4978] eta: 3 days, 4:04:50 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0619 
eval_avg_sim: 0.6249 video-cosine_similarity: 0.9381 time: 59.3613 data: 0.0009 max mem: 11173 res mem: 15204 +2025-05-12T15:53:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:53:08 | INFO | __main__ : Step: 96000 +2025-05-12T15:53:08 | INFO | __main__ : Current Frame Index within Batch Video: 88/247 +2025-05-12T15:53:08 | INFO | __main__ : Batch-wise Cosine Similarity | 88.17% +2025-05-12T15:53:08 | INFO | __main__ : Cosine Embedding Loss | 0.1183 +2025-05-12T15:53:08 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:53:08 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:53:08 | INFO | __main__ : Evaluation Average Sim | 0.6249 +2025-05-12T15:53:08 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:53:08 | INFO | __main__ : Performing periodic evaluation at global step 96000... +2025-05-12T15:53:08 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:53:08 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:53:08 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:53:08 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:53:17 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5005 +2025-05-12T15:53:17 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0096000.png +2025-05-12T15:53:17 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:53:17 | INFO | __main__ : Evaluation at step 96000 complete. 
Average Similarity: 0.5005 +2025-05-12T15:53:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:53:41 | INFO | __main__ : Step: 96100 +2025-05-12T15:53:41 | INFO | __main__ : Current Frame Index within Batch Video: 188/247 +2025-05-12T15:53:41 | INFO | __main__ : Batch-wise Cosine Similarity | 92.88% +2025-05-12T15:53:41 | INFO | __main__ : Cosine Embedding Loss | 0.0712 +2025-05-12T15:53:41 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:53:41 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:53:41 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:53:55 | INFO | utils.basic_utils : Train Epoch: [0] [ 398/4978] eta: 3 days, 4:05:09 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0640 eval_avg_sim: 0.5005 video-cosine_similarity: 0.9360 time: 59.8236 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:54:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:54:05 | INFO | __main__ : Step: 96200 +2025-05-12T15:54:05 | INFO | __main__ : Current Frame Index within Batch Video: 47/247 +2025-05-12T15:54:05 | INFO | __main__ : Batch-wise Cosine Similarity | 80.71% +2025-05-12T15:54:05 | INFO | __main__ : Cosine Embedding Loss | 0.1929 +2025-05-12T15:54:05 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:54:05 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:54:05 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:54:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:54:29 | INFO | __main__ : Step: 96300 +2025-05-12T15:54:29 | INFO | __main__ : Current Frame Index within Batch Video: 147/247 +2025-05-12T15:54:29 | INFO | __main__ : Batch-wise Cosine Similarity | 91.65% +2025-05-12T15:54:29 | INFO | 
__main__ : Cosine Embedding Loss | 0.0835 +2025-05-12T15:54:29 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:54:29 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:54:29 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:54:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:54:52 | INFO | __main__ : Step: 96400 +2025-05-12T15:54:52 | INFO | __main__ : Current Frame Index within Batch Video: 247/247 +2025-05-12T15:54:52 | INFO | __main__ : Batch-wise Cosine Similarity | 96.35% +2025-05-12T15:54:52 | INFO | __main__ : Cosine Embedding Loss | 0.0365 +2025-05-12T15:54:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:54:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:54:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:54:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 399/4978] eta: 3 days, 4:03:43 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0365 eval_avg_sim: 0.5005 video-cosine_similarity: 0.9635 time: 59.8269 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:55:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:55:16 | INFO | __main__ : Step: 96500 +2025-05-12T15:55:16 | INFO | __main__ : Current Frame Index within Batch Video: 106/247 +2025-05-12T15:55:16 | INFO | __main__ : Batch-wise Cosine Similarity | 90.26% +2025-05-12T15:55:16 | INFO | __main__ : Cosine Embedding Loss | 0.0974 +2025-05-12T15:55:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:55:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:55:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:55:40 | INFO | __main__ : 
──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:55:40 | INFO | __main__ : Step: 96600 +2025-05-12T15:55:40 | INFO | __main__ : Current Frame Index within Batch Video: 206/247 +2025-05-12T15:55:40 | INFO | __main__ : Batch-wise Cosine Similarity | 91.77% +2025-05-12T15:55:40 | INFO | __main__ : Cosine Embedding Loss | 0.0823 +2025-05-12T15:55:40 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:55:40 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:55:40 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:55:50 | INFO | utils.basic_utils : Train Epoch: [0] [ 400/4978] eta: 3 days, 4:02:16 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0513 eval_avg_sim: 0.5005 video-cosine_similarity: 0.9487 time: 59.8253 data: 0.0005 max mem: 11173 res mem: 15204 +2025-05-12T15:56:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:56:04 | INFO | __main__ : Step: 96700 +2025-05-12T15:56:04 | INFO | __main__ : Current Frame Index within Batch Video: 65/247 +2025-05-12T15:56:04 | INFO | __main__ : Batch-wise Cosine Similarity | 85.09% +2025-05-12T15:56:04 | INFO | __main__ : Cosine Embedding Loss | 0.1491 +2025-05-12T15:56:04 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:56:04 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:56:04 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:56:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:56:28 | INFO | __main__ : Step: 96800 +2025-05-12T15:56:28 | INFO | __main__ : Current Frame Index within Batch Video: 165/247 +2025-05-12T15:56:28 | INFO | __main__ : Batch-wise Cosine Similarity | 89.80% +2025-05-12T15:56:28 | INFO | __main__ : Cosine Embedding Loss | 0.1020 +2025-05-12T15:56:28 | INFO 
| __main__ : Learning Rate | 0.000010 +2025-05-12T15:56:28 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:56:28 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:56:47 | INFO | utils.basic_utils : Train Epoch: [0] [ 401/4978] eta: 3 days, 4:00:50 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0466 eval_avg_sim: 0.5005 video-cosine_similarity: 0.9534 time: 59.3576 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:56:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:56:52 | INFO | __main__ : Step: 96900 +2025-05-12T15:56:52 | INFO | __main__ : Current Frame Index within Batch Video: 24/247 +2025-05-12T15:56:52 | INFO | __main__ : Batch-wise Cosine Similarity | 73.48% +2025-05-12T15:56:52 | INFO | __main__ : Cosine Embedding Loss | 0.2652 +2025-05-12T15:56:52 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:56:52 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:56:52 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:57:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:57:16 | INFO | __main__ : Step: 97000 +2025-05-12T15:57:16 | INFO | __main__ : Current Frame Index within Batch Video: 124/247 +2025-05-12T15:57:16 | INFO | __main__ : Batch-wise Cosine Similarity | 89.67% +2025-05-12T15:57:16 | INFO | __main__ : Cosine Embedding Loss | 0.1033 +2025-05-12T15:57:16 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:57:16 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:57:16 | INFO | __main__ : Evaluation Average Sim | 0.5005 +2025-05-12T15:57:16 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:57:16 | INFO | __main__ : Performing periodic evaluation at global step 97000... 
+2025-05-12T15:57:16 | INFO | __main__ : Starting evaluation on video: /home/zli/.cache/huggingface/hub/datasets--qingy2024--backflip_train/snapshots/3abc68a45c66978e97815267df65084e53a92826/1.mp4 +2025-05-12T15:57:16 | INFO | __main__ : Warming up streaming model for evaluation with first 7 frames... +2025-05-12T15:57:16 | INFO | __main__ : Warm-up complete for evaluation. +2025-05-12T15:57:16 | INFO | __main__ : Processing and comparing from frame 7 onwards... +2025-05-12T15:57:25 | INFO | __main__ : Evaluation complete. Average Cosine Similarity: 0.5791 +2025-05-12T15:57:25 | INFO | __main__ : Saved evaluation plot to scripts/pretraining/clip/B14/B14/cosine_sim_graphs/graph_step_0097000.png +2025-05-12T15:57:25 | INFO | __main__ : Evaluation complete. Model set back to train() mode. +2025-05-12T15:57:25 | INFO | __main__ : Evaluation at step 97000 complete. Average Similarity: 0.5791 +2025-05-12T15:57:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:57:49 | INFO | __main__ : Step: 97100 +2025-05-12T15:57:49 | INFO | __main__ : Current Frame Index within Batch Video: 224/247 +2025-05-12T15:57:49 | INFO | __main__ : Batch-wise Cosine Similarity | 93.45% +2025-05-12T15:57:49 | INFO | __main__ : Cosine Embedding Loss | 0.0655 +2025-05-12T15:57:49 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:57:49 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:57:49 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:57:54 | INFO | utils.basic_utils : Train Epoch: [0] [ 402/4978] eta: 3 days, 4:01:10 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0512 eval_avg_sim: 0.5791 video-cosine_similarity: 0.9488 time: 59.8228 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:58:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:58:13 | INFO | __main__ : 
Step: 97200 +2025-05-12T15:58:13 | INFO | __main__ : Current Frame Index within Batch Video: 83/247 +2025-05-12T15:58:13 | INFO | __main__ : Batch-wise Cosine Similarity | 86.66% +2025-05-12T15:58:13 | INFO | __main__ : Cosine Embedding Loss | 0.1334 +2025-05-12T15:58:13 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:58:13 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:58:13 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:58:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:58:36 | INFO | __main__ : Step: 97300 +2025-05-12T15:58:36 | INFO | __main__ : Current Frame Index within Batch Video: 183/247 +2025-05-12T15:58:36 | INFO | __main__ : Batch-wise Cosine Similarity | 92.95% +2025-05-12T15:58:36 | INFO | __main__ : Cosine Embedding Loss | 0.0705 +2025-05-12T15:58:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:58:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:58:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:58:52 | INFO | utils.basic_utils : Train Epoch: [0] [ 403/4978] eta: 3 days, 3:59:43 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0605 eval_avg_sim: 0.5791 video-cosine_similarity: 0.9395 time: 59.8195 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T15:59:00 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:59:00 | INFO | __main__ : Step: 97400 +2025-05-12T15:59:00 | INFO | __main__ : Current Frame Index within Batch Video: 42/247 +2025-05-12T15:59:00 | INFO | __main__ : Batch-wise Cosine Similarity | 78.69% +2025-05-12T15:59:00 | INFO | __main__ : Cosine Embedding Loss | 0.2131 +2025-05-12T15:59:00 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:59:00 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:59:00 | 
INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:59:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:59:24 | INFO | __main__ : Step: 97500 +2025-05-12T15:59:24 | INFO | __main__ : Current Frame Index within Batch Video: 142/247 +2025-05-12T15:59:24 | INFO | __main__ : Batch-wise Cosine Similarity | 87.67% +2025-05-12T15:59:24 | INFO | __main__ : Cosine Embedding Loss | 0.1233 +2025-05-12T15:59:24 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:59:24 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:59:24 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:59:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:59:48 | INFO | __main__ : Step: 97600 +2025-05-12T15:59:48 | INFO | __main__ : Current Frame Index within Batch Video: 242/247 +2025-05-12T15:59:48 | INFO | __main__ : Batch-wise Cosine Similarity | 92.45% +2025-05-12T15:59:48 | INFO | __main__ : Cosine Embedding Loss | 0.0755 +2025-05-12T15:59:48 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T15:59:48 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T15:59:48 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T15:59:49 | INFO | utils.basic_utils : Train Epoch: [0] [ 404/4978] eta: 3 days, 3:58:17 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0687 eval_avg_sim: 0.5791 video-cosine_similarity: 0.9313 time: 59.8169 data: 0.0001 max mem: 11173 res mem: 15204 +2025-05-12T16:00:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T16:00:12 | INFO | __main__ : Step: 97700 +2025-05-12T16:00:12 | INFO | __main__ : Current Frame Index within Batch Video: 101/247 +2025-05-12T16:00:12 | INFO | 
__main__ : Batch-wise Cosine Similarity | 86.83% +2025-05-12T16:00:12 | INFO | __main__ : Cosine Embedding Loss | 0.1317 +2025-05-12T16:00:12 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T16:00:12 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T16:00:12 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T16:00:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T16:00:36 | INFO | __main__ : Step: 97800 +2025-05-12T16:00:36 | INFO | __main__ : Current Frame Index within Batch Video: 201/247 +2025-05-12T16:00:36 | INFO | __main__ : Batch-wise Cosine Similarity | 92.86% +2025-05-12T16:00:36 | INFO | __main__ : Cosine Embedding Loss | 0.0714 +2025-05-12T16:00:36 | INFO | __main__ : Learning Rate | 0.000010 +2025-05-12T16:00:36 | INFO | __main__ : Temperature | 0.0126 +2025-05-12T16:00:36 | INFO | __main__ : ──────────────────────────────────────────────────────────────────────────────── +2025-05-12T16:00:46 | INFO | utils.basic_utils : Train Epoch: [0] [ 405/4978] eta: 3 days, 3:56:51 lr: 0.000010 temperature: 0.0126 video-loss_cosine: 0.0683 eval_avg_sim: 0.5791 video-cosine_similarity: 0.9317 time: 59.3483 data: 0.0001 max mem: 11173 res mem: 15204 diff --git a/V4.3-ckpt/zero_to_fp32.py b/V4.3-ckpt/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/V4.3-ckpt/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob 
rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return 
zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Materialize every entry of ``state_dict`` as a torch tensor.

    Entries that are the very same object (shared weights) are converted once and the
    resulting tensor is reused under each of their names.

    Args:
        - ``state_dict``: mapping of name to an object exposing ``contiguous()``, ``shape`` and ``dtype``
        - ``return_empty_tensor``: when True, allocate uninitialized tensors of matching shape and dtype
          instead of copying any data

    Returns:
        - a new dict with the same keys, in the same order
    """
    materialized = {}
    first_name_by_id = {}
    for key, value in state_dict.items():
        ident = id(value)
        if ident not in first_name_by_id:
            # first time this object is seen: convert it and remember under which name
            first_name_by_id[ident] = key
            if return_empty_tensor:
                materialized[key] = torch.empty(value.shape, dtype=value.dtype)
            else:
                materialized[key] = value.contiguous()
        else:
            # shared tensors: reuse the tensor produced for the first name
            materialized[key] = materialized[first_name_by_id[ident]]
    return materialized
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into fp32 ``state_dict`` file(s) under ``output_dir`` that
    can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory that receives the fp32 state_dict output files (created if missing)
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
          ``None`` writes a single file
        - ``safe_serialization``: save with `safetensors` instead of the traditional PyTorch way (that uses `pickle`)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt
          to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Raises:
        - ``ImportError``: when ``safetensors`` / ``huggingface_hub`` is required by the chosen options
          but is not installed
    """
    sharding_requested = max_shard_size is not None

    # Optional dependencies are imported first so that a missing package is reported before any
    # checkpoint data is read
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if sharding_requested:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Lazy entries are only materialized shard by shard further down
    lazy_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                               tag,
                                                               exclude_frozen_parameters,
                                                               lazy_mode=True)

    if safe_serialization:
        weights_name = "model.safetensors"
        filename_pattern = "model{suffix}.safetensors"
        index_name = "model.safetensors.index.json"
    else:
        weights_name = "pytorch_model.bin"
        filename_pattern = "pytorch_model{suffix}.bin"
        index_name = "pytorch_model.bin.index.json"

    if sharding_requested:
        # plan the shards from empty tensors of matching shape/dtype, so no weight is copied yet
        placeholder_state_dict = to_torch_tensor(lazy_state_dict, return_empty_tensor=True)
        split_plan = split_torch_state_dict_into_shards(placeholder_state_dict,
                                                        filename_pattern=filename_pattern,
                                                        max_shard_size=max_shard_size)
    else:
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        split_plan = StateDictSplit(is_sharded=False,
                                    filename_to_tensors={weights_name: list(lazy_state_dict.keys())})

    os.makedirs(output_dir, exist_ok=True)
    for shard_file, tensor_names in tqdm(split_plan.filename_to_tensors.items(), desc="Saving checkpoint shards"):
        shard_tensors = to_torch_tensor({tensor_name: lazy_state_dict[tensor_name] for tensor_name in tensor_names})
        shard_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_tensors, shard_path, metadata={"format": "pt"})
        else:
            torch.save(shard_tensors, shard_path)
        # drop every reference to the tensors just written before the next shard is built
        for saved_name in list(shard_tensors):
            del lazy_state_dict[saved_name]
            del shard_tensors[saved_name]
        del shard_tensors
        gc.collect()

    if not split_plan.is_sharded:
        return

    # A sharded checkpoint additionally gets an index that maps each weight to its file
    index = {
        "metadata": split_plan.metadata,
        "weight_map": split_plan.tensor_to_filename,
    }
    with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and write the consolidated fp32 weights.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # NOTE: adjacent string literals are concatenated as-is, so every fragment below ends with an
    # explicit space to keep the rendered help text readable.
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level switch read by the conversion helpers to enable their debug prints
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)