Upload folder using huggingface_hub
- .gitattributes +9 -0
- log-e2e-epoch10_speech2speech_rerun/2a8211e0e75e000bbae1134b21fdbfa4.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/2e9fddd659b1167aed93534c608f1d99.wav +0 -0
- log-e2e-epoch10_speech2speech_rerun/31b4015dd1258ff1f8be44d6504e6956.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/3243962376f8c2e411e587e6482f9367.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/450e7541d093ee0e7e12ab76a0dff546.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/88562bce265cff88246933e62b756e84.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/ca7f2b2ab703a4f37b5da09fd119a28a.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/cer-summary-test-epoch-999-avg-1.txt +2 -0
- log-e2e-epoch10_speech2speech_rerun/d60daeaebb5b0b14ee34f5d714075108.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/ee07a1222f1d62cf0358f51d12f4c67c.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/errs-test-beam-search-epoch-999-avg-1.txt +1000 -0
- log-e2e-epoch10_speech2speech_rerun/f5cf13be42666535036bdf0f05fcc0e3.wav +3 -0
- log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-36-58 +66 -0
- log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-40-09 +66 -0
- log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-41-35 +75 -0
- log-e2e-epoch10_speech2speech_rerun/recogs-test-beam-search-epoch-999-avg-1.txt +26 -0
- log/log-train-2025-04-24-08-58-17 +689 -0
- log/log-train-2025-04-24-09-09-22 +689 -0
- log/log-train-2025-04-24-09-15-56 +689 -0
- log/log-train-2025-04-24-10-12-23 +600 -0
- log/log-train-2025-04-24-10-14-50 +600 -0
- log/log-train-2025-04-24-10-17-26 +607 -0
- log/log-train-2025-04-24-10-21-52 +604 -0
- log/log-train-2025-04-24-10-24-20 +0 -0
- log/log-train-2025-04-24-14-23-07 +0 -0
- log/log-train-2025-04-25-00-57-07 +71 -0
- log/log-train-2025-04-25-00-58-05 +575 -0
- log/log-train-2025-04-25-01-06-08 +71 -0
- log/log-train-2025-04-25-01-06-09 +71 -0
- log/log-train-2025-04-25-01-07-00 +0 -0
- pytorch_model.bin +3 -0
- tensorboard/events.out.tfevents.1745485127.7515258.1454637.0 +3 -0
- tensorboard/events.out.tfevents.1745485786.7515258.1472312.0 +3 -0
- tensorboard/events.out.tfevents.1745486182.7515258.1482888.0 +3 -0
- tensorboard/events.out.tfevents.1745489567.7515258.1571829.0 +3 -0
- tensorboard/events.out.tfevents.1745489713.7515258.1575936.0 +3 -0
- tensorboard/events.out.tfevents.1745489870.7515258.1580280.0 +3 -0
- tensorboard/events.out.tfevents.1745490173.7518205.24828.0 +3 -0
- tensorboard/events.out.tfevents.1745490285.7518205.28691.0 +3 -0
- tensorboard/events.out.tfevents.1745505052.7518205.257175.0 +3 -0
- tensorboard/events.out.tfevents.1745543674.7518205.398874.0 +3 -0
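
The commit title indicates these files were pushed with the huggingface_hub Python client. A minimal sketch of how such a commit is typically produced; the folder path and repo_id below are placeholders, not values recorded in this commit:

    # Sketch: pushing a local experiment directory to the Hub.
    # folder_path and repo_id are hypothetical; only the tool itself
    # (huggingface_hub) is named by the commit title.
    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_folder(
        folder_path="slam_omni/exp_speech2speech_rerun",  # assumed local dir
        repo_id="your-username/your-repo",                # hypothetical repo
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )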
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/2a8211e0e75e000bbae1134b21fdbfa4.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/31b4015dd1258ff1f8be44d6504e6956.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/3243962376f8c2e411e587e6482f9367.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/450e7541d093ee0e7e12ab76a0dff546.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/88562bce265cff88246933e62b756e84.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/ca7f2b2ab703a4f37b5da09fd119a28a.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/d60daeaebb5b0b14ee34f5d714075108.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/ee07a1222f1d62cf0358f51d12f4c67c.wav filter=lfs diff=lfs merge=lfs -text
+log-e2e-epoch10_speech2speech_rerun/f5cf13be42666535036bdf0f05fcc0e3.wav filter=lfs diff=lfs merge=lfs -text
log-e2e-epoch10_speech2speech_rerun/2a8211e0e75e000bbae1134b21fdbfa4.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2e2b46ccfa55b20f3f14951cdd5477dd34bc40d85466183d755561fa37aa42c
+size 632364
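
These .wav entries are stored as Git LFS pointer files rather than raw audio: each records the LFS spec version, the SHA-256 digest (oid) of the actual payload, and its size in bytes. A minimal sketch of verifying a fetched file against its pointer; the local path is a placeholder:

    # Sketch: check a downloaded LFS object against the oid/size in its pointer.
    import hashlib
    import os

    path = "2a8211e0e75e000bbae1134b21fdbfa4.wav"  # hypothetical local copy
    expected_oid = "f2e2b46ccfa55b20f3f14951cdd5477dd34bc40d85466183d755561fa37aa42c"
    expected_size = 632364

    assert os.path.getsize(path) == expected_size, "size mismatch"
    with open(path, "rb") as f:
        assert hashlib.sha256(f.read()).hexdigest() == expected_oid, "oid mismatch"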
log-e2e-epoch10_speech2speech_rerun/2e9fddd659b1167aed93534c608f1d99.wav ADDED
Binary file (59.9 kB)
log-e2e-epoch10_speech2speech_rerun/31b4015dd1258ff1f8be44d6504e6956.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cf3ef873d0a7ff9e571d49b021955269d9de581544a223c278f8524fa04ff96
+size 903212
log-e2e-epoch10_speech2speech_rerun/3243962376f8c2e411e587e6482f9367.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:989b982aa534f47c7f4d631799c554b5b77f07869a1acf3196a6637db740fd76
+size 903212
log-e2e-epoch10_speech2speech_rerun/450e7541d093ee0e7e12ab76a0dff546.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c97ba2b00e335755d49d9535584d28db36708304749d739329afc739d602f63
+size 785452
log-e2e-epoch10_speech2speech_rerun/88562bce265cff88246933e62b756e84.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2092b38745caa4592f1a81d87c2e5b8cd2847173ae12e63e2387147976cae7a1
+size 326188
log-e2e-epoch10_speech2speech_rerun/ca7f2b2ab703a4f37b5da09fd119a28a.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92f685106b2ffa556c31b03afd26ce63144a7f241395bac53114f480e2fa41b1
+size 903212
log-e2e-epoch10_speech2speech_rerun/cer-summary-test-epoch-999-avg-1.txt ADDED
@@ -0,0 +1,2 @@
+settings	CER
+beam-search	229.61
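
A CER above 100% is possible because insertions count against a fixed number of reference characters: per the decode log further down, the test set has 385 reference characters, while the hypotheses contribute 643 insertions, 78 deletions, and 163 substitutions. A quick check of the arithmetic behind the 229.61 figure:

    # Sketch: reproducing the reported CER from the counts in the decode log.
    ins, dels, subs, ref_len = 643, 78, 163, 385
    cer = 100.0 * (ins + dels + subs) / ref_len  # (643 + 78 + 163) / 385 = 884 / 385
    print(f"{cer:.2f}")  # -> 229.61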
log-e2e-epoch10_speech2speech_rerun/d60daeaebb5b0b14ee34f5d714075108.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb1be48a81e69d6c594cfc255be0788037854cf3b7df9fd157a9db70b1f8e673
+size 903212
log-e2e-epoch10_speech2speech_rerun/ee07a1222f1d62cf0358f51d12f4c67c.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4479c168805652797770f1697db0de4eabda67ad4961649268a4f6cb9ca4b624
+size 280108
log-e2e-epoch10_speech2speech_rerun/errs-test-beam-search-epoch-999-avg-1.txt ADDED
@@ -0,0 +1,1000 @@
+%WER = 229.61
+Errors: 643 insertions, 78 deletions, 163 substitutions, over 385 reference words (144 correct)
+Search below for sections starting with PER-UTT DETAILS:, SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:
+
+PER-UTT DETAILS: corr or (ref->hyp)
+2a8211e0e75e000bbae1134b21fdbfa4: (以 下->答 案) 是 (一 道 小 学 数 学 题 : 有 一 条 直 线 , 上 面 有 6->D . 6 0 %
+向 右 走 的 蚂 蚁 数 量 为 2) 只 (蚂 蚁->*) , (其 中 4->总 共 6) 只 (向 左 走->蚂 蚁 中) , (*->有) 2 只 (向 右 走 。 问 :->是) 向 右 走 的 (蚂 蚁 数 量 占 比 是 多 少 ? A .->, 因 此 6 0 % 的 6 只 蚂 蚁 中 有) 2 (5 % B . 3 3 . 3 % C . 5 0 %->只 。
+所 以 , 答 案 选) D (. 6 6 . 6 % 回 答 该 问 题 。->。 < | i m _ e n d | >)
+2e9fddd659b1167aed93534c608f1d99: (A 公 司 去 年 亏 损 了->2) 5 0 (0->*) 万 美 元 (, 今 年 净 利 润 增 长 了 5 0 % , 今 年 的 净 利 润 是 多 少 ?->。 < | i m _ e n d | >)
+31b4015dd1258ff1f8be44d6504e6956: (请 给 我 介 绍->人 工 智 能 是) 一 (下->种 模 拟) 人 (*->类 智 能 的 技 术 , 它 包 括 机 器 学 习 、 自 然 语 言 处 理 、 计 算 机 视 觉 等 多 个 方 面 。 目 前 , 人) 工 智 能 (*->技 术 已 经 广 泛 应 用 于 各 个 领 域 , 比 如 医 疗 保 健 、 金 融 、 交 通 等 。 其 中 , 深 度 学 习 是 当 前 最 热 门) 的 技 术 (*->之 一 , 它 可 以 实 现 对 大 量 数 据 的 自 动 分 析) 和 (*->学 习 , 并 且 可 以 用 于 图 像 识 别 、 语 音 识 别 等 方 面 。 未 来 , 随 着 科 技 的) 发 展 (*->, 人 工 智 能 将 会 更 加 普 及 , 为 我 们 的 生 活 带 来 更 多 的 便 利 和 创 新) 。 (*->< | i m _ e n d | >)
+3243962376f8c2e411e587e6482f9367: (写 一 首 关 于->在 大 自 然 的 怀 抱 中 , 我 感 受 到 了 生 命 的 美 好 。 山 川 河 流 、 花 草 树 木 , 都 散 发 着 独 特 的 魅 力 。 阳 光 明 媚 , 微 风 拂 面 , 让 人 感 受 到) 自 然 (*->的 力 量 和) 美 (景->丽 。 在 这 片 广 袤) 的 (诗 歌->大 地 上 , 我 看 到 了 无 尽 的 可 能 和 希 望 , 也 感 受 到 了 内 心 的 宁 静 和 平 静) 。 (*->< | i m _ e n d | >)
+450e7541d093ee0e7e12ab76a0dff546: (帮 我 想->好 的 , 以 下 是) 一 些 关 于 狗 的 名 字 (*->: 旺 财 、 小 黑 、 巧 克 力 、 大 黄 、 金 毛 、 拉 布 拉 多 、 吉 娃 娃 、 柯 基 、 贵 宾 、 斗 牛 犬) 。 (*->当 然 , 如 果 您 有 其 他 要 求 或 想 法 , 我 也 可 以 根 据 您 的 需 求 进 行 更 精 准 的 推 荐 。 < | i m _ e n d | >)
+88562bce265cff88246933e62b756e84: (给 定 一 段 文 本 和 关 键 词 列 表 ,->*) 删 除 (文 本 中 包 含 所 有 给 定 关 键 词->后) 的 (子 字 符 串 。->*) 文 本 (*->为) : " 这 是 一 个 测 试 句 子 , 目 的 是 看 看 模 型 是 否 (可 以->*) 正 确 地 从 这 个 句 子 中 删 除 关 键 词 (。->*) " (\ n 关 键 词 列 表 : [ ‘ 测 试 ’ , ‘ 模 型 ’ ]->。 < | i m _ e n d | >)
+ca7f2b2ab703a4f37b5da09fd119a28a: 根 据 输 入 的 文 本 , (*->可 以) 判 断 它 (是 否->*) 属 于 新 闻 报 道 (、 广 告 或 博 客->类 别 。 原 因 如 下 :
+1 . ) 文 (章 类 别 之 一 。->本 主 题 : 该 段 文 字 描 述 的 是 奥 运 会 的 消 息 , 这 通 常 是) 新 闻 报 道 (: “ 奥 运 会 在 东 京 隆 重 开 幕->的 主 题 之 一 。
+2 . 文 本 内 容 : 文 本 中 提 到 了 多 个 事 件) , (*->包 括) 中 国 代 表 团 获 得 多 项 奖 牌 (*->, 这 是 典 型 的 新 闻 报 道 内 容) 。 (”->
+3 . 文 本 语 气 : 整 个 文 本 的 语 气 客 观 、 中 立 , 没 有 明 显 的 广 告 或 博 客 文 章 的 推 销 和 宣 传 元 素 。
+综 上 所 述 , 可 以 得 出 结 论 , 这 段 文 本 是 一 篇 新 闻 报 道 。 < | i m _ e n d | >)
+d60daeaebb5b0b14ee34f5d714075108: (给->如 果 你 喜 欢 自 然 风 光 ,) 我 推 荐 (一->你 去 云 南 的 丽 江 、 大 理 和 西 双 版 纳 。 这) 些 (旅 游 景 点->地 方 有 壮 丽 的 山 脉 、 清 澈 的 湖 泊 和 独 特 的 文 化) 。 (*->如 果 你 想 体 验 异 国 风 情 , 可 以 前 往 泰 国 的 曼 谷 、 印 度 尼 西 亚 的 巴 厘 岛 或 日 本 的 东 京 。 此 外 , 欧 洲 有 许 多 美 丽 的 城 市 , 如 巴 黎 、 威 尼 斯 和 阿 姆 斯 特 丹 等 , 也 是 不 错 的 选 择 。 < | i m _ e n d | >)
+ee07a1222f1d62cf0358f51d12f4c67c: (很->*) 好 (*->的) , (现 在 请 你->让 我 来) 计 算 一 下 (*->。 在) 这 篇 文 章 中 (*->, “ 获 得 ”) 出 现 了 (多 少 个 “ 获 得 ” 字 样 。->两 次 。 < | i m _ e n d | >)
+f5cf13be42666535036bdf0f05fcc0e3: 好 的 (。 现 在 请 你 将->, 替 换 后 的 文 本 为 : ") 这 (*->是 一) 个 (文 本 中 的 所 有->句 子 这 是 目) 的 (逗 号 都 替 换 成 空 格->看 看 是 否 可 以 正 确 地 从 这 个 句 子 中 删 除 关 键 词 ") 。 (*->< | i m _ e n d | >)
+
+SUBSTITUTIONS: count ref -> hyp
+1 ? -> 蚂
+1 : -> 的
+1 : -> 向
+1 : -> n
+1 , -> 数
+1 , -> 。
+1 首 -> 自
+1 题 -> >
+1 题 ->
+
+1 面 -> 为
+1 隆 -> .
+1 问 -> |
+1 长 -> n
+1 键 -> i
+1 量 -> 6
+1 重 ->
+1 都 -> 是
+1 道 -> .
+1 逗 -> 看
+1 运 -> 之
+1 走 -> 中
+1 请 -> 来
+1 请 -> 后
+1 请 -> 人
+1 该 -> d
+1 诗 -> 大
+1 词 -> m
+1 表 -> e
+1 蚂 -> ,
+1 蚁 -> 因
+1 获 -> |
+1 给 -> 工
+1 给 -> 如
+1 绍 -> 是
+1 线 -> 蚁
+1 类 -> 主
+1 答 -> n
+1 章 -> 本
+1 空 -> 正
+1 直 -> 蚂
+1 的 -> 这
+1 现 -> 让
+1 现 -> 替
+1 点 -> 壮
+1 游 -> 方
+1 润 -> _
+1 测 -> >
+1 比 -> %
+1 歌 -> 地
+1 格 -> 确
+1 样 -> e
+1 条 -> 的
+1 本 -> 子
+1 有 -> 目
+1 有 -> 右
+1 有 -> 2
+1 替 -> 否
+1 景 -> 有
+1 景 -> 丽
+1 是 -> 的
+1 旅 -> 地
+1 文 -> 后
+1 文 -> 句
+1 数 -> 此
+1 数 -> 0
+1 换 -> 可
+1 所 -> 是
+1 或 -> 原
+1 我 -> 的
+1 我 -> 智
+1 成 -> 以
+1 想 -> ,
+1 得 -> i
+1 开 -> 文
+1 广 -> 别
+1 年 -> |
+1 幕 -> 本
+1 帮 -> 好
+1 左 -> 蚁
+1 少 -> 次
+1 少 -> 只
+1 小 ->
+1 将 -> 文
+1 客 -> 如
+1 学 -> 6
+1 学 -> %
+1 字 -> _
+1 奥 -> 题
+1 多 -> 两
+1 多 -> 6
+1 增 -> e
+1 在 -> 换
+1 在 -> 我
+1 在 -> 。
+1 回 -> e
+1 告 -> 。
+1 向 -> 蚂
+1 向 -> 是
+1 号 -> 看
+1 占 -> 0
+1 博 -> 因
+1 别 -> 题
+1 利 -> m
+1 列 -> _
+1 净 -> i
+1 写 -> 在
+1 其 -> 总
+1 关 -> 然
+1 关 -> |
+1 你 -> 的
+1 会 -> 一
+1 以 -> 答
+1 介 -> 能
+1 今 -> <
+1 京 -> 2
+1 于 -> 的
+1 了 -> d
+1 之 -> :
+1 中 -> 共
+1 中 ->
+1 个 -> 。
+1 东 ->
+
+1 下 -> 种
+1 下 -> 案
+1 上 -> 量
+1 一 -> 走
+1 一 -> 该
+1 一 -> 大
+1 一 -> 你
+1 一 -> D
+1 。 -> ,
+1 。 -> 段
+1 。 -> n
+1 、 -> 类
+1 ” -> m
+1 ” ->
+
+1 “ -> 主
+1 “ -> <
+1 ‘ -> |
+1 n -> <
+1 \ -> 。
+1 [ -> d
+1 B ->
+
+1 A -> 蚁
+1 A -> 2
+1 6 -> |
+1 6 -> m
+1 6 -> <
+1 5 -> 只
+1 5 -> |
+1 4 -> 6
+1 3 -> ,
+1 3 -> 案
+1 3 -> 以
+1 0 -> >
+1 . -> 答
+1 . -> 所
+1 . -> 中
+1 . -> 。
+1 . -> i
+1 % -> 选
+1 % -> 。
+1 % -> _
+
+DELETIONS: count ref
+4 。
+3 ,
+2 键
+2 词
+2 给
+2 本
+2 是
+2 年
+2 定
+2 关
+2 ’
+2 0
+2 %
+1 ?
+1 :
+1 问
+1 走
+1 试
+1 表
+1 蚂
+1 蚁
+1 符
+1 的
+1 润
+1 段
+1 模
+1 有
+1 文
+1 损
+1 所
+1 很
+1 少
+1 字
+1 子
+1 多
+1 型
+1 和
+1 含
+1 否
+1 司
+1 右
+1 可
+1 去
+1 包
+1 利
+1 列
+1 净
+1 公
+1 你
+1 以
+1 今
+1 亏
+1 了
+1 串
+1 中
+1 一
+1 ‘
+1 ]
+1 C
+1 6
+1 5
+1 .
+
+INSERTIONS: count hyp
+28 ,
+28 的
+20 、
+13 。
+13 |
+8 是
+8 和
+7 文
+7 可
+7 以
+7 d
+7 >
+6 这
+6 本
+6 n
+6 m
+6 i
+6 e
+6 _
+6 <
+5 :
+5 有
+5 多
+5 到
+5 中
+5 个
+4 语
+4 等
+4 我
+4 如
+4 了
+3 风
+3 面
+3 自
+3 能
+3 特
+3 然
+3 果
+3 更
+3 新
+3 技
+3 或
+3 感
+3 广
+3 学
+3 大
+3 受
+3 力
+3 前
+3 内
+3 人
+3 习
+3 也
+3 丽
+3 为
+3 一
+2 静
+2 闻
+2 金
+2 量
+2 道
+2 通
+2 述
+2 识
+2 西
+2 美
+2 着
+2 用
+2 生
+2 理
+2 独
+2 求
+2 气
+2 来
+2 机
+2 术
+2 智
+2 明
+2 方
+2 斯
+2 推
+2 据
+2 括
+2 拉
+2 报
+2 想
+2 您
+2 得
+2 当
+2 度
+2 巴
+2 山
+2 尼
+2 容
+2 客
+2 它
+2 娃
+2 在
+2 国
+2 包
+2 别
+2 其
+2 光
+2 你
+2 会
+2 于
+2 下
+2 上
+2 .
+2 "
+2
+2
+
+1 黑
+1 黎
+1 黄
+1 魅
+1 验
+1 领
+1 音
+1 需
+1 随
+1 除
+1 阿
+1 阳
+1 门
+1 键
+1 错
+1 销
+1 都
+1 选
+1 进
+1 运
+1 贵
+1 财
+1 谷
+1 词
+1 论
+1 许
+1 让
+1 计
+1 言
+1 觉
+1 视
+1 观
+1 要
+1 袤
+1 行
+1 融
+1 获
+1 荐
+1 草
+1 花
+1 脉
+1 综
+1 结
+1 经
+1 纳
+1 素
+1 精
+1 类
+1 篇
+1 算
+1 章
+1 立
+1 科
+1 看
+1 目
+1 疗
+1 现
+1 犬
+1 牛
+1 版
+1 片
+1 热
+1 澈
+1 湖
+1 清
+1 深
+1 消
+1 流
+1 活
+1 洲
+1 泰
+1 泛
+1 法
+1 泊
+1 河
+1 没
+1 江
+1 毛
+1 比
+1 段
+1 此
+1 欧
+1 欢
+1 模
+1 根
+1 树
+1 柯
+1 析
+1 未
+1 木
+1 望
+1 最
+1 曼
+1 普
+1 显
+1 旺
+1 日
+1 无
+1 斗
+1 整
+1 数
+1 散
+1 提
+1 描
+1 择
+1 拟
+1 拂
+1 抱
+1 所
+1 情
+1 息
+1 怀
+1 心
+1 微
+1 往
+1 异
+1 应
+1 并
+1 平
+1 常
+1 带
+1 希
+1 布
+1 市
+1 已
+1 巧
+1 工
+1 川
+1 岛
+1 尽
+1 小
+1 将
+1 对
+1 宾
+1 宣
+1 实
+1 宁
+1 字
+1 子
+1 媚
+1 威
+1 姆
+1 好
+1 奥
+1 外
+1 处
+1 基
+1 域
+1 城
+1 型
+1 地
+1 图
+1 器
+1 喜
+1 命
+1 告
+1 吉
+1 各
+1 句
+1 发
+1 双
+1 及
+1 去
+1 厘
+1 印
+1 博
+1 南
+1 医
+1 化
+1 动
+1 加
+1 利
+1 删
+1 创
+1 分
+1 出
+1 准
+1 典
+1 关
+1 克
+1 元
+1 像
+1 健
+1 保
+1 便
+1 体
+1 传
+1 件
+1 们
+1 他
+1 从
+1 京
+1 交
+1 亚
+1 云
+1 事
+1 之
+1 丹
+1 东
+1 且
+1 不
+1 ”
+1 “
+1 3
+1 1
+
+PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp
+, 6 37 11 38
+的 9 36 11 43
+。 6 27 13 26
+、 0 21 1 20
+| 0 20 0 20
+是 4 15 7 16
+文 4 12 7 13
+本 2 11 5 10
+以 0 11 2 9
+n 0 11 1 10
+: 1 10 5 7
+有 0 10 4 6
+中 3 10 6 10
+一 4 10 10 8
+m 0 10 0 10
+i 0 10 0 10
+e 0 10 0 10
+d 0 10 0 10
+_ 0 10 0 10
+> 0 10 0 10
+< 0 10 0 10
+. 0 10 6 4
+和 1 9 2 9
+可 0 9 1 8
+多 1 8 4 6
+6 0 8 4 4
+这 4 7 4 11
+我 1 7 3 6
+% 0 7 5 2
+如 0 6 0 6
+在 0 6 3 3
+了 1 6 3 5
+个 3 6 4 8
+
+0 6 0 6
+蚂 0 5 2 3
+蚁 0 5 2 3
+学 0 5 2 3
+大 0 5 0 5
+到 0 5 0 5
+关 2 5 6 3
+你 0 5 2 3
+0 1 5 4 3
+0 5 0 5
+题 0 4 2 2
+面 0 4 1 3
+键 1 4 4 2
+量 0 4 1 3
+语 0 4 0 4
+词 1 4 4 2
+自 1 4 1 5
+能 1 4 1 5
+给 0 4 4 0
+等 0 4 0 4
+然 1 4 1 5
+数 0 4 2 2
+所 0 4 2 2
+或 0 4 1 3
+广 0 4 1 3
+别 0 4 1 3
+人 1 4 1 5
+丽 0 4 0 4
+为 0 4 0 4
+下 1 4 3 3
+3 0 4 3 1
+风 0 3 0 3
+道 2 3 3 4
+走 1 3 3 2
+请 0 3 3 0
+类 0 3 1 2
+答 0 3 1 2
+看 2 3 2 5
+现 1 3 3 2
+特 0 3 0 3
+段 0 3 1 2
+果 0 3 0 3
+来 0 3 0 3
+更 0 3 0 3
+智 1 3 1 4
+方 0 3 0 3
+新 2 3 2 5
+技 1 3 1 4
+感 0 3 0 3
+想 0 3 1 2
+得 1 3 2 3
+年 0 3 3 0
+少 0 3 3 0
+客 0 3 1 2
+字 1 3 3 2
+子 2 3 3 4
+地 1 3 1 4
+向 1 3 3 2
+受 0 3 0 3
+包 0 3 1 2
+力 0 3 0 3
+前 0 3 0 3
+利 0 3 2 1
+内 0 3 0 3
+其 0 3 1 2
+会 0 3 1 2
+于 2 3 3 4
+习 0 3 0 3
+也 0 3 0 3
+之 0 3 1 2
+上 0 3 1 2
+” 0 3 2 1
+“ 0 3 2 1
+5 1 3 4 1
+2 2 3 2 5
+? 0 2 2 0
+静 0 2 0 2
+闻 2 2 2 4
+问 0 2 2 0
+金 0 2 0 2
+都 0 2 1 1
+通 0 2 0 2
+选 0 2 0 2
+述 0 2 0 2
+运 0 2 1 1
+该 0 2 1 1
+识 0 2 0 2
+让 0 2 0 2
+西 0 2 0 2
+表 1 2 3 1
+获 1 2 2 2
+美 2 2 2 4
+章 1 2 2 2
+着 0 2 0 2
+目 1 2 1 3
+用 0 2 0 2
+生 0 2 0 2
+理 0 2 0 2
+独 0 2 0 2
+润 0 2 2 0
+求 0 2 0 2
+气 0 2 0 2
+比 0 2 1 1
+此 0 2 0 2
+模 1 2 2 2
+案 0 2 0 2
+机 0 2 0 2
+术 1 2 1 3
+替 0 2 1 1
+景 0 2 2 0
+明 0 2 0 2
+斯 0 2 0 2
+推 1 2 1 3
+据 1 2 1 3
+换 0 2 1 1
+括 0 2 0 2
+拉 0 2 0 2
+报 2 2 2 4
+您 0 2 0 2
+当 0 2 0 2
+度 0 2 0 2
+巴 0 2 0 2
+工 1 2 1 3
+山 0 2 0 2
+尼 0 2 0 2
+小 0 2 1 1
+将 0 2 1 1
+容 0 2 0 2
+定 0 2 2 0
+它 1 2 1 3
+娃 0 2 0 2
+好 2 2 2 4
+奥 0 2 1 1
+型 1 2 2 2
+国 1 2 1 3
+因 0 2 0 2
+告 0 2 1 1
+否 1 2 2 2
+后 0 2 0 2
+右 1 2 2 2
+只 3 2 3 5
+句 2 2 2 4
+去 0 2 1 1
+博 0 2 1 1
+列 0 2 2 0
+净 0 2 2 0
+光 0 2 0 2
+今 0 2 2 0
+京 0 2 1 1
+主 0 2 0 2
+东 0 2 1 1
+’ 0 2 2 0
+‘ 0 2 2 0
+A 0 2 2 0
+" 2 2 2 4
+黑 0 1 0 1
+黎 0 1 0 1
+黄 0 1 0 1
+魅 0 1 0 1
+验 0 1 0 1
+首 0 1 1 0
+领 0 1 0 1
+音 0 1 0 1
+需 0 1 0 1
+随 0 1 0 1
+隆 0 1 1 0
+除 2 1 2 3
+阿 0 1 0 1
+阳 0 1 0 1
+门 0 1 0 1
+长 0 1 1 0
+错 0 1 0 1
+销 0 1 0 1
+重 0 1 1 0
+逗 0 1 1 0
+进 0 1 0 1
+贵 0 1 0 1
+财 0 1 0 1
+谷 0 1 0 1
+诗 0 1 1 0
+试 1 1 2 1
+论 0 1 0 1
+许 0 1 0 1
+计 1 1 1 2
+言 0 1 0 1
+觉 0 1 0 1
+视 0 1 0 1
+观 0 1 0 1
+要 0 1 0 1
+袤 0 1 0 1
+行 0 1 0 1
+融 0 1 0 1
+荐 1 1 1 2
+草 0 1 0 1
+花 0 1 0 1
+脉 0 1 0 1
+综 0 1 0 1
+结 0 1 0 1
+经 0 1 0 1
+绍 0 1 1 0
+线 0 1 1 0
+纳 0 1 0 1
+素 0 1 0 1
+精 0 1 0 1
+篇 1 1 1 2
+算 1 1 1 2
+符 0 1 1 0
+立 0 1 0 1
+空 0 1 1 0
+科 0 1 0 1
+种 0 1 0 1
+确 1 1 1 2
+直 0 1 1 0
+疗 0 1 0 1
+犬 0 1 0 1
+牛 0 1 0 1
+版 0 1 0 1
+片 0 1 0 1
+热 0 1 0 1
+点 0 1 1 0
+澈 0 1 0 1
+湖 0 1 0 1
+游 0 1 1 0
+清 0 1 0 1
+深 0 1 0 1
+消 0 1 0 1
+测 1 1 2 1
+流 0 1 0 1
+活 0 1 0 1
+洲 0 1 0 1
+泰 0 1 0 1
+泛 0 1 0 1
+法 0 1 0 1
+泊 0 1 0 1
+河 0 1 0 1
+没 0 1 0 1
+江 0 1 0 1
+毛 0 1 0 1
+正 1 1 1 2
+歌 0 1 1 0
+欧 0 1 0 1
+欢 0 1 0 1
+次 0 1 0 1
+格 0 1 1 0
+根 1 1 1 2
+样 0 1 1 0
+树 0 1 0 1
+柯 0 1 0 1
+析 0 1 0 1
+条 0 1 1 0
+未 0 1 0 1
+木 0 1 0 1
+望 0 1 0 1
+最 0 1 0 1
+曼 0 1 0 1
+普 0 1 0 1
+显 0 1 0 1
+旺 0 1 0 1
+日 0 1 0 1
+无 0 1 0 1
+旅 0 1 1 0
+斗 0 1 0 1
+整 0 1 0 1
+散 0 1 0 1
+提 0 1 0 1
+描 0 1 0 1
+损 0 1 1 0
+择 0 1 0 1
+拟 0 1 0 1
+拂 0 1 0 1
+抱 0 1 0 1
+成 0 1 1 0
+情 0 1 0 1
+息 0 1 0 1
+总 0 1 0 1
+怀 0 1 0 1
+心 0 1 0 1
+微 0 1 0 1
+很 0 1 1 0
+往 0 1 0 1
+异 0 1 0 1
+开 0 1 1 0
+应 0 1 0 1
+并 0 1 0 1
+平 0 1 0 1
+幕 0 1 1 0
+常 0 1 0 1
+帮 0 1 1 0
+带 0 1 0 1
+希 0 1 0 1
+布 0 1 0 1
+市 0 1 0 1
+已 0 1 0 1
+巧 0 1 0 1
+左 0 1 1 0
+川 0 1 0 1
+岛 0 1 0 1
+尽 0 1 0 1
+对 0 1 0 1
+宾 0 1 0 1
+宣 0 1 0 1
+实 0 1 0 1
+宁 0 1 0 1
+媚 0 1 0 1
+威 0 1 0 1
+姆 0 1 0 1
+外 0 1 0 1
+处 0 1 0 1
+壮 0 1 0 1
+增 0 1 1 0
+基 0 1 0 1
+域 0 1 0 1
+城 0 1 0 1
+图 0 1 0 1
+回 0 1 1 0
+器 0 1 0 1
+喜 0 1 0 1
+命 0 1 0 1
+含 0 1 1 0
+吉 0 1 0 1
+各 0 1 0 1
+司 0 1 1 0
+号 0 1 1 0
+发 1 1 1 2
+双 0 1 0 1
+及 0 1 0 1
+原 0 1 0 1
+厘 0 1 0 1
+印 0 1 0 1
+占 0 1 1 0
+南 0 1 0 1
+医 0 1 0 1
+化 0 1 0 1
+动 0 1 0 1
+加 0 1 0 1
+删 2 1 2 3
+创 0 1 0 1
+分 0 1 0 1
+出 1 1 1 2
+准 0 1 0 1
+写 0 1 1 0
+典 0 1 0 1
+共 0 1 0 1
+公 0 1 1 0
+克 0 1 0 1
+元 1 1 1 2
+像 0 1 0 1
+健 0 1 0 1
+保 0 1 0 1
+便 0 1 0 1
+体 0 1 0 1
+传 0 1 0 1
+件 0 1 0 1
+们 0 1 0 1
+他 0 1 0 1
+从 1 1 1 2
+介 0 1 1 0
+交 0 1 0 1
+亚 0 1 0 1
+云 0 1 0 1
+亏 0 1 1 0
+事 0 1 0 1
+丹 0 1 0 1
+串 0 1 1 0
+两 0 1 0 1
+且 0 1 0 1
+不 0 1 0 1
+] 0 1 1 0
+\ 0 1 1 0
+[ 0 1 1 0
+D 1 1 1 2
+C 0 1 1 0
+B 0 1 1 0
+4 0 1 1 0
+1 0 1 0 1
+项 1 0 1 1
+输 1 0 1 1
+狗 1 0 1 1
+牌 1 0 1 1
+断 1 0 1 1
+属 1 0 1 1
+展 1 0 1 1
+奖 1 0 1 1
+团 1 0 1 1
+名 1 0 1 1
+判 1 0 1 1
+入 1 0 1 1
+代 1 0 1 1
+些 2 0 2 2
+万 1 0 1 1
log-e2e-epoch10_speech2speech_rerun/f5cf13be42666535036bdf0f05fcc0e3.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f4910d3a76bde71dfd2668454aa7caf63978e2ecb8290b97ca868e91dfaa242
+size 320044
log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-36-58 ADDED
@@ -0,0 +1,66 @@
+2025-04-29 06:36:58,118 INFO [decode.py:695] Decoding started
+2025-04-29 06:36:58,120 INFO [decode.py:696] {
+  "audio_key": "question_audio",
+  "avg": 1,
+  "beam_size": 1,
+  "bucketing_sampler": true,
+  "drop_last": true,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "enable_speech_output": true,
+  "encoder_projector_ds_rate": 8,
+  "env_info": {
+    "IP address": "0.114.200.181",
+    "hostname": "7522485",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Apr 25 10:31:43 2025",
+    "icefall-git-sha1": "d742043e-dirty",
+    "icefall-path": "/workspace/slam/icefall_omni",
+    "k2-build-type": "Release",
+    "k2-git-date": "Tue Oct 29 09:02:19 2024",
+    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/workspace/slam/lhotse_pr/lhotse/__init__.py",
+    "lhotse-version": "1.31.0.dev+git.8c56a3e6.dirty",
+    "python-version": "3.11",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.4",
+    "torch-version": "2.4.0"
+  },
+  "epoch": 999,
+  "exp_dir": "slam_omni/exp_speech2speech_rerun",
+  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+  "input_strategy": "PrecomputedFeatures",
+  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+  "log_dir": "slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun",
+  "manifest_dir": "data/fbank",
+  "max_duration": 1,
+  "method": "e2e-epoch10_speech2speech_rerun",
+  "num_buckets": 30,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "remove_whisper_encoder_input_length_restriction": true,
+  "resample_to_16kHz": true,
+  "return_cuts": true,
+  "shuffle": true,
+  "spec_aug_time_warp_factor": 80,
+  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+  "suffix": "epoch-999-avg-1",
+  "text_key": "answer",
+  "token2wav_path": "/workspace/CosyVoice-300M-SFT",
+  "use_flash_attn": true,
+  "use_lora": true
+}
+2025-04-29 06:36:58,120 INFO [decode.py:702] device: cuda
+2025-04-29 06:37:30,467 INFO [flow.py:51] input frame rate=50
+2025-04-29 06:37:34,971 INFO [processor.py:101] building fst for zh_normalizer ...
+2025-04-29 06:37:59,937 INFO [processor.py:106] done
+2025-04-29 06:37:59,942 INFO [processor.py:107] fst path: /opt/conda/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
+2025-04-29 06:37:59,943 INFO [processor.py:108] /opt/conda/lib/python3.11/site-packages/tn/zh_tn_verbalizer.fst
+2025-04-29 06:37:59,949 INFO [processor.py:95] found existing fst: /opt/conda/lib/python3.11/site-packages/tn/en_tn_tagger.fst
+2025-04-29 06:37:59,950 INFO [processor.py:96] /opt/conda/lib/python3.11/site-packages/tn/en_tn_verbalizer.fst
+2025-04-29 06:37:59,950 INFO [processor.py:97] skip building fst for en_normalizer ...
+2025-04-29 06:38:05,719 INFO [decode.py:708] Number of model parameters: 1314396804
+2025-04-29 06:38:05,720 INFO [data_module.py:414] About to get test cuts
log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-40-09 ADDED
@@ -0,0 +1,66 @@
+2025-04-29 06:40:09,268 INFO [decode.py:696] Decoding started
+2025-04-29 06:40:09,270 INFO [decode.py:697] {
+  "audio_key": "question_audio",
+  "avg": 1,
+  "beam_size": 1,
+  "bucketing_sampler": true,
+  "drop_last": true,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "enable_speech_output": true,
+  "encoder_projector_ds_rate": 8,
+  "env_info": {
+    "IP address": "0.114.200.181",
+    "hostname": "7522485",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Apr 25 10:31:43 2025",
+    "icefall-git-sha1": "d742043e-dirty",
+    "icefall-path": "/workspace/slam/icefall_omni",
+    "k2-build-type": "Release",
+    "k2-git-date": "Tue Oct 29 09:02:19 2024",
+    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/workspace/slam/lhotse_pr/lhotse/__init__.py",
+    "lhotse-version": "1.31.0.dev+git.8c56a3e6.dirty",
+    "python-version": "3.11",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.4",
+    "torch-version": "2.4.0"
+  },
+  "epoch": 999,
+  "exp_dir": "slam_omni/exp_speech2speech_rerun",
+  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+  "input_strategy": "PrecomputedFeatures",
+  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+  "log_dir": "slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun",
+  "manifest_dir": "data/fbank",
+  "max_duration": 1,
+  "method": "e2e-epoch10_speech2speech_rerun",
+  "num_buckets": 30,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "remove_whisper_encoder_input_length_restriction": true,
+  "resample_to_16kHz": true,
+  "return_cuts": true,
+  "shuffle": true,
+  "spec_aug_time_warp_factor": 80,
+  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+  "suffix": "epoch-999-avg-1",
+  "text_key": "answer",
+  "token2wav_path": "/workspace/CosyVoice-300M-SFT",
+  "use_flash_attn": true,
+  "use_lora": true
+}
+2025-04-29 06:40:09,270 INFO [decode.py:703] device: cuda
+2025-04-29 06:40:31,296 INFO [flow.py:51] input frame rate=50
+2025-04-29 06:40:32,568 INFO [processor.py:101] building fst for zh_normalizer ...
+2025-04-29 06:40:57,320 INFO [processor.py:106] done
+2025-04-29 06:40:57,320 INFO [processor.py:107] fst path: /opt/conda/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
+2025-04-29 06:40:57,321 INFO [processor.py:108] /opt/conda/lib/python3.11/site-packages/tn/zh_tn_verbalizer.fst
+2025-04-29 06:40:57,325 INFO [processor.py:95] found existing fst: /opt/conda/lib/python3.11/site-packages/tn/en_tn_tagger.fst
+2025-04-29 06:40:57,325 INFO [processor.py:96] /opt/conda/lib/python3.11/site-packages/tn/en_tn_verbalizer.fst
+2025-04-29 06:40:57,325 INFO [processor.py:97] skip building fst for en_normalizer ...
+2025-04-29 06:40:59,720 INFO [decode.py:709] Number of model parameters: 1314396804
+2025-04-29 06:40:59,721 INFO [data_module.py:414] About to get test cuts
log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-41-35 ADDED
@@ -0,0 +1,75 @@
+2025-04-29 06:41:35,728 INFO [decode.py:697] Decoding started
+2025-04-29 06:41:35,729 INFO [decode.py:698] {
+  "audio_key": "question_audio",
+  "avg": 1,
+  "beam_size": 1,
+  "bucketing_sampler": true,
+  "drop_last": true,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "enable_speech_output": true,
+  "encoder_projector_ds_rate": 8,
+  "env_info": {
+    "IP address": "0.114.200.181",
+    "hostname": "7522485",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Apr 25 10:31:43 2025",
+    "icefall-git-sha1": "d742043e-dirty",
+    "icefall-path": "/workspace/slam/icefall_omni",
+    "k2-build-type": "Release",
+    "k2-git-date": "Tue Oct 29 09:02:19 2024",
+    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/workspace/slam/lhotse_pr/lhotse/__init__.py",
+    "lhotse-version": "1.31.0.dev+git.8c56a3e6.dirty",
+    "python-version": "3.11",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.4",
+    "torch-version": "2.4.0"
+  },
+  "epoch": 999,
+  "exp_dir": "slam_omni/exp_speech2speech_rerun",
+  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+  "input_strategy": "PrecomputedFeatures",
+  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+  "log_dir": "slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun",
+  "manifest_dir": "data/fbank",
+  "max_duration": 1,
+  "method": "e2e-epoch10_speech2speech_rerun",
+  "num_buckets": 30,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "remove_whisper_encoder_input_length_restriction": true,
+  "resample_to_16kHz": true,
+  "return_cuts": true,
+  "shuffle": true,
+  "spec_aug_time_warp_factor": 80,
+  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+  "suffix": "epoch-999-avg-1",
+  "text_key": "answer",
+  "token2wav_path": "/workspace/CosyVoice-300M-SFT",
+  "use_flash_attn": true,
+  "use_lora": true
+}
+2025-04-29 06:41:35,729 INFO [decode.py:704] device: cuda
+2025-04-29 06:41:58,540 INFO [flow.py:51] input frame rate=50
+2025-04-29 06:41:59,853 INFO [processor.py:101] building fst for zh_normalizer ...
+2025-04-29 06:42:24,447 INFO [processor.py:106] done
+2025-04-29 06:42:24,447 INFO [processor.py:107] fst path: /opt/conda/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
+2025-04-29 06:42:24,447 INFO [processor.py:108] /opt/conda/lib/python3.11/site-packages/tn/zh_tn_verbalizer.fst
+2025-04-29 06:42:24,452 INFO [processor.py:95] found existing fst: /opt/conda/lib/python3.11/site-packages/tn/en_tn_tagger.fst
+2025-04-29 06:42:24,452 INFO [processor.py:96] /opt/conda/lib/python3.11/site-packages/tn/en_tn_verbalizer.fst
+2025-04-29 06:42:24,452 INFO [processor.py:97] skip building fst for en_normalizer ...
+2025-04-29 06:42:26,967 INFO [decode.py:710] Number of model parameters: 1314396804
+2025-04-29 06:42:26,967 INFO [data_module.py:414] About to get test cuts
+2025-04-29 06:42:35,359 INFO [decode.py:627] batch 0/?, cuts processed until now is 1
+2025-04-29 06:44:35,705 INFO [decode.py:646] The transcripts are stored in slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun/recogs-test-beam-search-epoch-999-avg-1.txt
+2025-04-29 06:44:35,709 INFO [utils.py:670] [test-beam-search] %WER 229.61% [884 / 385, 643 ins, 78 del, 163 sub ]
+2025-04-29 06:44:35,719 INFO [decode.py:664] Wrote detailed error stats to slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun/errs-test-beam-search-epoch-999-avg-1.txt
+2025-04-29 06:44:35,722 INFO [decode.py:678]
+For test, CER of different settings are:
+beam-search	229.61	best for test
+
+2025-04-29 06:44:35,723 INFO [decode.py:754] Done!
log-e2e-epoch10_speech2speech_rerun/recogs-test-beam-search-epoch-999-avg-1.txt ADDED
@@ -0,0 +1,26 @@
+2a8211e0e75e000bbae1134b21fdbfa4: ref=['以下是一道小学数学题:', '有一条直线,上面有', '6', '只蚂蚁,其中', '4', '只向左走,2', '只向右走。问:向右走的蚂蚁数量占比是多少?', 'A.', '25%', 'B.', '33.3%', 'C.', '50%', 'D.', '66.6%', '回答该问题。']
+2a8211e0e75e000bbae1134b21fdbfa4: hyp=答案是D. 60%
+向右走的蚂蚁数量为2只,总共6只蚂蚁中,有2只是向右走的,因此60%的6只蚂蚁中有2只。
+所以,答案选D。<|im_end|>
+2e9fddd659b1167aed93534c608f1d99: ref=['A公司去年亏损了500万美元,今年净利润增长了50%,今年的净利润是多少?']
+2e9fddd659b1167aed93534c608f1d99: hyp=250万美元。<|im_end|>
+31b4015dd1258ff1f8be44d6504e6956: ref=['请给我介绍一下人工智能的技术和发展。']
+31b4015dd1258ff1f8be44d6504e6956: hyp=人工智能是一种模拟人类智能的技术,它包括机器学习、自然语言处理、计算机视觉等多个方面。目前,人工智能技术已经广泛应用于各个领域,比如医疗保健、金融、交通等。其中,深度学习是当前最热门的技术之一,它可以实现对大量数据的自动分析和学习,并且可以用于图像识别、语音识别等方面。未来,随着科技的发展,人工智能将会更加普及,为我们的生活带来更多的便利和创新。<|im_end|>
+3243962376f8c2e411e587e6482f9367: ref=['写一首关于自然美景的诗歌。']
+3243962376f8c2e411e587e6482f9367: hyp=在大自然的怀抱中,我感受到了生命的美好。山川河流、花草树木,都散发着独特的魅力。阳光明媚,微风拂面,让人感受到自然的力量和美丽。在这片广袤的大地上,我看到了无尽的可能和希望,也感受到了内心的宁静和平静。<|im_end|>
+450e7541d093ee0e7e12ab76a0dff546: ref=['帮我想一些关于狗的名字。']
+450e7541d093ee0e7e12ab76a0dff546: hyp=好的,以下是一些关于狗的名字:旺财、小黑、巧克力、大黄、金毛、拉布拉多、吉娃娃、柯基、贵宾、斗牛犬。当然,如果您有其他要求或想法,我也可以根据您的需求进行更精准的推荐。<|im_end|>
+88562bce265cff88246933e62b756e84: ref=['给定一段文本和关键词列表,删除文本中包含所有给定关键词的子字符串。', '文本:"这是一个测试句子,目的是看看模型是否可以正确地从这个句子中删除关键词。"\\n关键词列表:[‘测试’,‘模型’]']
+88562bce265cff88246933e62b756e84: hyp=删除后的文本为:"这是一个测试句子,目的是看看模型是否正确地从这个句子中删除关键词"。<|im_end|>
+ca7f2b2ab703a4f37b5da09fd119a28a: ref=['根据输入的文本,判断它是否属于新闻报道、广告或博客文章类别之一。', '新闻报道:“奥运会在东京隆重开幕,中国代表团获得多项奖牌。”']
+ca7f2b2ab703a4f37b5da09fd119a28a: hyp=根据输入的文本,可以判断它属于新闻报道类别。原因如下:
+1. 文本主题:该段文字描述的是奥运会的消息,这通常是新闻报道的主题之一。
+2. 文本内容:文本中提到了多个事件,包括中国代表团获得多项奖牌,这是典型的新闻报道内容。
+3. 文本语气:整个文本的语气客观、中立,没有明显的广告或博客文章的推销和宣传元素。
+综上所述,可以得出结论,这段文本是一篇新闻报道。<|im_end|>
+d60daeaebb5b0b14ee34f5d714075108: ref=['给我推荐一些旅游景点。']
+d60daeaebb5b0b14ee34f5d714075108: hyp=如果你喜欢自然风光,我推荐你去云南的丽江、大理和西双版纳。这些地方有壮丽的山脉、清澈的湖泊和独特的文化。如果你想体验异国风情,可以前往泰国的曼谷、印度尼西亚的巴厘岛或日本的东京。此外,欧洲有许多美丽的城市,如巴黎、威尼斯和阿姆斯特丹等,也是不错的选择。<|im_end|>
+ee07a1222f1d62cf0358f51d12f4c67c: ref=['很好,现在请你计算一下这篇文章中出现了多少个“获得”字样。']
+ee07a1222f1d62cf0358f51d12f4c67c: hyp=好的,让我来计算一下。在这篇文章中,“获得”出现了两次。<|im_end|>
+f5cf13be42666535036bdf0f05fcc0e3: ref=['好的。现在请你将这个文本中的所有的逗号都替换成空格。']
+f5cf13be42666535036bdf0f05fcc0e3: hyp=好的,替换后的文本为:"这是一个句子 这是目的看看是否可以正确地从这个句子中删除关键词"。<|im_end|>
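
The per-utterance alignments in the errs file above are computed at the character level: each Chinese character, digit, and punctuation mark counts as one token. A minimal Levenshtein-style sketch of deriving insertion/deletion/substitution counts for one ref/hyp pair follows; this is an illustrative reimplementation, not the actual icefall scoring code:

    # Sketch: character-level error counts along one optimal alignment
    # (illustrative only; icefall's utils.py performs the real scoring).
    def error_counts(ref: str, hyp: str):
        R, H = len(ref), len(hyp)
        # dp[i][j] = (total_edits, ins, dels, subs) for ref[:i] vs hyp[:j]
        dp = [[None] * (H + 1) for _ in range(R + 1)]
        for i in range(R + 1):
            dp[i][0] = (i, 0, i, 0)   # delete everything in ref[:i]
        for j in range(1, H + 1):
            dp[0][j] = (j, j, 0, 0)   # insert everything in hyp[:j]
        for i in range(1, R + 1):
            for j in range(1, H + 1):
                if ref[i - 1] == hyp[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1]
                    continue
                t_sub = dp[i - 1][j - 1][0]
                t_ins = dp[i][j - 1][0]
                t_del = dp[i - 1][j][0]
                if t_sub <= t_ins and t_sub <= t_del:
                    t, a, b, c = dp[i - 1][j - 1]
                    dp[i][j] = (t + 1, a, b, c + 1)
                elif t_ins <= t_del:
                    t, a, b, c = dp[i][j - 1]
                    dp[i][j] = (t + 1, a + 1, b, c)
                else:
                    t, a, b, c = dp[i - 1][j]
                    dp[i][j] = (t + 1, a, b + 1, c)
        return dp[R][H][1:]  # (ins, dels, subs)

On pairs like d60daeae above, a short reference against a long hypothesis yields mostly insertions, which is what pushes the overall CER past 100%.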
log/log-train-2025-04-24-08-58-17 ADDED
@@ -0,0 +1,689 @@
+2025-04-24 08:58:17,434 INFO [train.py:653] {
+  "allowed_excess_duration_ratio": 0.1,
+  "audio_key": "question_audio",
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "bucketing_sampler": true,
+  "deepscale": false,
+  "deepscale_config": null,
+  "deepspeed": true,
+  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
+  "drop_last": true,
+  "enable_musan": false,
+  "enable_spec_aug": true,
+  "enable_speech_output": true,
+  "encoder_projector_ds_rate": 8,
+  "env_info": {
+    "IP address": "0.114.172.122",
+    "hostname": "7515258",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Thu Apr 24 08:24:11 2025",
+    "icefall-git-sha1": "2e9be467-dirty",
+    "icefall-path": "/workspace/slam/icefall_omni",
+    "k2-build-type": "Release",
+    "k2-git-date": "Tue Oct 29 09:02:19 2024",
+    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
+    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
+    "python-version": "3.11",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.4",
+    "torch-version": "2.4.0"
+  },
+  "exp_dir": "slam_omni/exp_speech2speech_rerun",
+  "frame_shift_ms": 10,
+  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+  "input_strategy": "PrecomputedFeatures",
+  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+  "log_interval": 50,
+  "manifest_dir": "data/fbank",
+  "max_duration": 40,
+  "num_buckets": 30,
+  "num_epochs": 10,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "pretrained_model_path": null,
+  "resample_to_16kHz": true,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "sampler_state_dict_path": null,
+  "seed": 42,
+  "shuffle": true,
+  "spec_aug_time_warp_factor": 80,
+  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+  "start_epoch": 1,
+  "subsampling_factor": 2,
+  "tensorboard": true,
+  "text_key": "answer",
+  "unfreeze_llm": true,
+  "unfreeze_speech_projector": true,
+  "use_flash_attn": true,
+  "use_fp16": true,
+  "use_lora": true,
+  "valid_interval": 5000
+}
+2025-04-24 08:58:17,434 INFO [train.py:655] About to create model
+2025-04-24 08:58:43,456 INFO [train.py:808] Number of model parameters: 1326848644
+2025-04-24 08:58:43,457 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
+2025-04-24 08:58:43,457 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
129 |
+
2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
130 |
+
2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
131 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
132 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
133 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
134 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
135 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
136 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
137 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
138 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
139 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
140 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
141 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
142 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
143 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
144 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
145 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
146 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
147 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
148 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
149 |
+
2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
150 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
151 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
152 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
153 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
154 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
155 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
156 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
157 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
158 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
159 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
160 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
161 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
162 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
163 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
164 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
165 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
166 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
167 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
168 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
169 |
+
2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
170 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
171 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
172 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
173 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
174 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
175 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
176 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
177 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
178 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
179 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
180 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
181 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
182 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
183 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
184 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
185 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
186 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
187 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
188 |
+
2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
189 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
190 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
191 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
192 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
193 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
194 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
195 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
196 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
197 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
198 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
199 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
200 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
201 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
202 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
203 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
204 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
205 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
206 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
207 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
208 |
+
2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
209 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
210 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
211 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
212 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
213 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
214 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
215 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
216 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
217 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
218 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
219 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
220 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
221 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
222 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
223 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
224 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
225 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
226 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
227 |
+
2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
228 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
229 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
230 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
231 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
232 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
233 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
234 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
235 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
236 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
237 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
238 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
239 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
240 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
241 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
242 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
243 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
244 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
245 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
246 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
247 |
+
2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
248 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
249 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
250 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
251 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
252 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
253 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
254 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
255 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
256 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
257 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
258 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
259 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
260 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
261 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
262 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
263 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
264 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
265 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
266 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
267 |
+
2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
268 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
269 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
270 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
271 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
272 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
273 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
274 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
275 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
276 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
277 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
278 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
279 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
280 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
281 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
282 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
283 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
284 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
285 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
286 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
287 |
+
2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
288 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
289 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
290 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
291 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
292 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
293 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
294 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
295 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
296 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
297 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
298 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
299 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
300 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
301 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
302 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
303 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
304 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
305 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
306 |
+
2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
307 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
308 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
309 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
310 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
311 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
312 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
313 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
314 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
315 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
316 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
317 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
318 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
319 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
320 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
321 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
322 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
323 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
324 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
325 |
+
2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
326 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
327 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
328 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
329 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
330 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
331 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
332 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
333 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
334 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
335 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
336 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
337 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
338 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
339 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
340 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
341 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
342 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
343 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
344 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
345 |
+
2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
346 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
347 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
348 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
349 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
350 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
351 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
352 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
353 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
354 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
355 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
356 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
357 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
358 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
359 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
360 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
361 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
362 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
363 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
364 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
365 |
+
2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
366 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
367 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
368 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
369 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
370 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
371 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
372 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
373 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
374 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
375 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
376 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
377 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
378 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
379 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
380 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
381 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
382 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
383 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
384 |
+
2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
385 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
386 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
387 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
388 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
389 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
390 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
391 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
392 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
393 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
394 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
395 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
396 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
397 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
398 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
399 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
400 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
401 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
402 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
403 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
404 |
+
2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
405 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
406 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
407 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
408 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
409 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
410 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
|
411 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
|
412 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
|
413 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
|
414 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
|
415 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
|
416 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
|
417 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
|
418 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
|
419 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
|
420 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
|
421 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
|
422 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
|
423 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
|
424 |
+
2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
|
425 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
|
426 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
|
427 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
|
428 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
|
429 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
|
430 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
|
431 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
|
432 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
|
433 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
|
434 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
|
435 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
|
436 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
|
437 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
|
438 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
|
439 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
|
440 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
|
441 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
|
442 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
|
443 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
|
444 |
+
2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
|
445 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
|
446 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
|
447 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
|
448 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
|
449 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
|
450 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
|
451 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
|
452 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
|
453 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
|
454 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
|
455 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
|
456 |
+
2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
|
457 |
+
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 08:58:43,483 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
+2025-04-24 08:58:43,483 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
+2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
+2025-04-24 08:58:43,484 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
+2025-04-24 08:58:43,484 INFO [train.py:819] Device: cuda:1
+2025-04-24 08:58:44,314 INFO [train.py:823] Using DeepSpeed
+2025-04-24 08:58:46,121 INFO [data_module.py:445] About to get train cuts
+2025-04-24 08:58:46,124 INFO [data_module.py:250] Disable MUSAN
+2025-04-24 08:58:46,124 INFO [data_module.py:268] Enable SpecAugment
+2025-04-24 08:58:46,124 INFO [data_module.py:269] Time warp factor: 80
+2025-04-24 08:58:46,124 INFO [data_module.py:279] Num frame mask: 10
+2025-04-24 08:58:46,124 INFO [data_module.py:292] About to create train dataset
+2025-04-24 08:58:46,124 INFO [data_module.py:319] Using DynamicBucketingSampler.
+2025-04-24 08:58:46,156 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
+2025-04-24 08:58:46,338 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
+2025-04-24 08:58:46,831 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
+2025-04-24 08:58:46,985 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
+2025-04-24 08:58:47,053 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
+2025-04-24 08:58:47,120 INFO [data_module.py:336] About to create train dataloader
+2025-04-24 08:58:47,120 INFO [data_module.py:436] About to get test cuts
+2025-04-24 08:58:47,121 INFO [data_module.py:365] About to create dev dataset
+2025-04-24 08:58:47,209 INFO [data_module.py:379] About to create dev dataloader
+2025-04-24 08:58:47,215 INFO [train.py:875] start training from epoch 1
+2025-04-24 08:58:47,376 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
+2025-04-24 08:58:47,606 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
+2025-04-24 08:58:48,237 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
+2025-04-24 08:58:48,432 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
+2025-04-24 08:58:48,511 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
+2025-04-24 08:58:49,064 WARNING [train.py:847] Exclude cut with ID de225db1da1393474e4ce9af8cbafb42 from training. Duration: 5.433469387755102, lenth: 2230
+2025-04-24 08:58:49,354 WARNING [train.py:847] Exclude cut with ID ef376c7d1fb4606edb0547b2a7fb95b3 from training. Duration: 2.7399546485260773, lenth: 2637
+2025-04-24 08:58:49,692 WARNING [train.py:847] Exclude cut with ID c7f3c7da44c2c6e27f86bda421b958d8 from training. Duration: 2.995374149659864, lenth: 2318
+2025-04-24 08:58:49,867 WARNING [train.py:847] Exclude cut with ID 8d724f452d1d99549eebf90ab658e7bd from training. Duration: 5.712108843537415, lenth: 2209
+2025-04-24 08:58:49,983 WARNING [train.py:847] Exclude cut with ID a585785e795cb35d02d8f7b599352e16 from training. Duration: 4.911020408163266, lenth: 2575
+2025-04-24 08:58:50,062 WARNING [train.py:847] Exclude cut with ID 31a00c5e63c88ef67402f578ebc6ebf7 from training. Duration: 3.575873015873016, lenth: 2307
+2025-04-24 08:58:50,180 WARNING [train.py:847] Exclude cut with ID 32ab4761aaf08d9473fc8388c3a6cffc from training. Duration: 7.9760544217687075, lenth: 2201
+2025-04-24 08:58:50,237 WARNING [train.py:847] Exclude cut with ID 5fddb6bf9d5265c0409b81c85fadbaec from training. Duration: 3.4597732426303853, lenth: 2210
+2025-04-24 08:58:51,088 WARNING [train.py:847] Exclude cut with ID c5d43a2443ce7820cf864f7c55eafb77 from training. Duration: 3.912562358276644, lenth: 2315
+2025-04-24 08:58:52,211 WARNING [train.py:847] Exclude cut with ID a951e36267c8901d4c03fb6b4dffe636 from training. Duration: 4.73687074829932, lenth: 2223
+2025-04-24 08:58:52,283 WARNING [train.py:847] Exclude cut with ID 595680451c1ee7e42dba0452e165dea3 from training. Duration: 3.959002267573696, lenth: 2251
+2025-04-24 08:58:52,830 WARNING [train.py:847] Exclude cut with ID 20fa9b11557159d78f72ccc6b464de18 from training. Duration: 3.9938321995464854, lenth: 2215
+2025-04-24 08:58:52,884 WARNING [train.py:847] Exclude cut with ID 8a01fc72b4108673fd8393e523450e23 from training. Duration: 8.719092970521542, lenth: 2517
+2025-04-24 08:58:53,397 WARNING [train.py:847] Exclude cut with ID 924d54288847d448a20809a26b320477 from training. Duration: 3.81968253968254, lenth: 2302
+2025-04-24 08:58:53,695 WARNING [train.py:847] Exclude cut with ID a7f91a17f6318d896d0bee7b7307d000 from training. Duration: 5.294149659863946, lenth: 2239
+2025-04-24 08:58:54,189 WARNING [train.py:847] Exclude cut with ID 8d7cff5d1d25a773f9d27590b5912b60 from training. Duration: 4.597551020408163, lenth: 2203
+2025-04-24 08:58:54,309 WARNING [train.py:847] Exclude cut with ID 13407ca1ec78def38346569b54d11a78 from training. Duration: 5.909478458049887, lenth: 2280
+2025-04-24 08:58:54,313 WARNING [train.py:847] Exclude cut with ID 6390ab6964b52f2ee9bf36826393abaa from training. Duration: 4.017052154195011, lenth: 2209
+2025-04-24 08:58:54,357 WARNING [train.py:847] Exclude cut with ID f55c19435d6643ebf7356a1072a014fa from training. Duration: 5.259319727891157, lenth: 2310
+2025-04-24 08:58:55,044 WARNING [train.py:847] Exclude cut with ID 546f5fb6d11db41d104ee7b0667f4e1d from training. Duration: 5.712108843537415, lenth: 2770
+2025-04-24 08:58:55,337 WARNING [train.py:847] Exclude cut with ID b2069846ab4df18f5caa676751aa0b40 from training. Duration: 6.6525170068027215, lenth: 2588
+2025-04-24 08:58:55,708 WARNING [train.py:847] Exclude cut with ID 5717a65b9855425c0330e6022a71cc2f from training. Duration: 5.073560090702948, lenth: 2269
+2025-04-24 08:58:56,015 WARNING [train.py:847] Exclude cut with ID d6e37a975b3c69a86a808347b53e42f2 from training. Duration: 5.259319727891157, lenth: 2885
+2025-04-24 08:58:56,164 WARNING [train.py:847] Exclude cut with ID d15e75257f8dac616c078113f8f96c63 from training. Duration: 7.6741950113378685, lenth: 2276
+2025-04-24 08:58:56,227 WARNING [train.py:847] Exclude cut with ID 7ea49b35e0dc536537ae3ddcd07032af from training. Duration: 4.829750566893424, lenth: 2206
+2025-04-24 08:58:56,689 WARNING [train.py:847] Exclude cut with ID 0a1a80c13fd0c38d97c9491dff84c7da from training. Duration: 2.914104308390023, lenth: 2220
+2025-04-24 08:58:56,739 WARNING [train.py:847] Exclude cut with ID 359ff41c694bb848c3e37b948b1434f7 from training. Duration: 6.478367346938776, lenth: 2881
+2025-04-24 08:58:56,747 WARNING [train.py:847] Exclude cut with ID 6989881203588034ddf38ae5a7cfe238 from training. Duration: 5.491519274376417, lenth: 2243
+2025-04-24 08:58:57,027 WARNING [train.py:847] Exclude cut with ID fd7b5cb9a6bce2c02bf8a7c3dd2b35d6 from training. Duration: 5.793378684807256, lenth: 2305
+2025-04-24 08:58:58,007 WARNING [train.py:847] Exclude cut with ID e1c5efd6d4cd7c6dc0f3ac7720161c49 from training. Duration: 9.438911564625851, lenth: 2201
+2025-04-24 08:58:58,405 WARNING [train.py:847] Exclude cut with ID c13ffcc7b60ea460731e417d2d9575cf from training. Duration: 3.250793650793651, lenth: 2205
+2025-04-24 08:58:58,498 WARNING [train.py:847] Exclude cut with ID ba1a37e64efe00d8b8cbfb0069e65f8d from training. Duration: 3.274013605442177, lenth: 2290
+2025-04-24 08:58:58,579 WARNING [train.py:847] Exclude cut with ID de9f5fe02034bc4b0a66fbe378fffb1a from training. Duration: 6.455147392290249, lenth: 2228
+2025-04-24 08:58:59,067 WARNING [train.py:847] Exclude cut with ID 8ca7a35870994edd410e1ea8e951faa3 from training. Duration: 7.012426303854875, lenth: 2233
+2025-04-24 08:58:59,214 WARNING [train.py:847] Exclude cut with ID 3260368505eddf46fb1c9b562124ff18 from training. Duration: 12.097596371882085, lenth: 2261
+2025-04-24 08:58:59,412 WARNING [train.py:847] Exclude cut with ID 36878ea551121147626747e9fc39ae17 from training. Duration: 5.537959183673469, lenth: 2224
+2025-04-24 08:59:00,361 WARNING [train.py:847] Exclude cut with ID d4c7270e6c9b3dffed8643040b5f922b from training. Duration: 4.133151927437642, lenth: 2329
+2025-04-24 08:59:00,843 WARNING [train.py:847] Exclude cut with ID 179517464c97fbbcb8909a56b849bf73 from training. Duration: 6.617687074829932, lenth: 2257
+2025-04-24 08:59:01,065 WARNING [train.py:847] Exclude cut with ID 17480e9524e4c0f8e405487e50c8e98e from training. Duration: 4.551111111111111, lenth: 2263
+2025-04-24 08:59:01,165 WARNING [train.py:847] Exclude cut with ID 2c10af552b911b8ac45c1fc7bb9fcb5d from training. Duration: 6.2693877551020405, lenth: 2481
+2025-04-24 08:59:01,276 WARNING [train.py:847] Exclude cut with ID aa2539588175d8c96975a06a8993e5b6 from training. Duration: 5.456689342403628, lenth: 2573
+2025-04-24 08:59:01,317 WARNING [train.py:847] Exclude cut with ID 32945d8e21a8628bb3c93fd499330639 from training. Duration: 13.455963718820861, lenth: 2205
+2025-04-24 08:59:01,495 WARNING [train.py:847] Exclude cut with ID 64db5d6166c3edc340d8e1b02c059f19 from training. Duration: 4.818140589569161, lenth: 2259
+2025-04-24 08:59:01,638 WARNING [train.py:847] Exclude cut with ID 39483151e08291efe16217ab41622d59 from training. Duration: 7.952834467120182, lenth: 2439
+2025-04-24 08:59:02,375 WARNING [train.py:847] Exclude cut with ID c4a74dd23aa9a3fb13330113787139d6 from training. Duration: 3.239183673469388, lenth: 2276
+2025-04-24 08:59:02,420 WARNING [train.py:847] Exclude cut with ID b12a8836f1aa525a763ac657a44229a3 from training. Duration: 5.0967800453514736, lenth: 2427
+2025-04-24 08:59:05,911 WARNING [train.py:847] Exclude cut with ID 185cfa3ae9c12babe1e217de612f0c7b from training. Duration: 6.257777777777778, lenth: 2587
+2025-04-24 08:59:06,001 WARNING [train.py:847] Exclude cut with ID 08eff5b64d191a0d50634630437a3107 from training. Duration: 10.135510204081633, lenth: 2273
+2025-04-24 08:59:06,092 WARNING [train.py:847] Exclude cut with ID bcd8207538deeccfcf8d1de872b8f7e9 from training. Duration: 4.179591836734694, lenth: 2242
+2025-04-24 08:59:06,302 WARNING [train.py:847] Exclude cut with ID f909651c923fdd6cc0eaa81de64f4a5b from training. Duration: 8.068934240362811, lenth: 2421
+2025-04-24 08:59:06,330 WARNING [train.py:847] Exclude cut with ID f5d34097b89bd201018189ff44b10b33 from training. Duration: 3.7964625850340137, lenth: 2250
+2025-04-24 08:59:06,425 WARNING [train.py:847] Exclude cut with ID 245530abe5a62b390611b7a3b91ddfef from training. Duration: 9.079002267573696, lenth: 2299
+2025-04-24 08:59:07,344 WARNING [train.py:847] Exclude cut with ID 79d0d3129c764fd4d9c5391b6353e349 from training. Duration: 5.0967800453514736, lenth: 2309
+2025-04-24 08:59:07,382 WARNING [train.py:847] Exclude cut with ID 1969177f73511ef204642f43ce029d05 from training. Duration: 3.657142857142857, lenth: 2339
+2025-04-24 08:59:07,895 WARNING [train.py:847] Exclude cut with ID f601e756ecd6074cd3037c61e413cb6e from training. Duration: 5.712108843537415, lenth: 2287
+2025-04-24 08:59:08,149 WARNING [train.py:847] Exclude cut with ID 9f7758e035b3eebcb9267aa49ad65d4c from training. Duration: 3.9938321995464854, lenth: 2303
+2025-04-24 08:59:08,619 WARNING [train.py:847] Exclude cut with ID b29a42e8c0af813bf605d7f0a974620c from training. Duration: 5.050340136054421, lenth: 2219
+2025-04-24 08:59:08,791 WARNING [train.py:847] Exclude cut with ID 192e5517b050a4659892a2832d986764 from training. Duration: 5.12, lenth: 2420
+2025-04-24 08:59:08,959 WARNING [train.py:847] Exclude cut with ID bc262e923d16a434a3c5701bea841e4e from training. Duration: 7.418775510204082, lenth: 2205
+2025-04-24 08:59:09,126 WARNING [train.py:847] Exclude cut with ID 5141e01019dec47ed309fbf5881ab675 from training. Duration: 8.034104308390022, lenth: 2284
+2025-04-24 08:59:09,156 WARNING [train.py:847] Exclude cut with ID 3555e5c7123d744b39ba7ecce44ce1e2 from training. Duration: 3.7151927437641725, lenth: 2226
+2025-04-24 08:59:09,605 WARNING [train.py:847] Exclude cut with ID 973a1dc43ca452580d6999027a82fbec from training. Duration: 5.270929705215419, lenth: 2250
+2025-04-24 08:59:09,718 WARNING [train.py:847] Exclude cut with ID e2791accb62bb5e4bde9e0e2b04945c5 from training. Duration: 3.517823129251701, lenth: 2286
+2025-04-24 08:59:09,897 WARNING [train.py:847] Exclude cut with ID e91152d010c4479932b7ab64a998d5c9 from training. Duration: 4.771700680272109, lenth: 2468
+2025-04-24 08:59:10,032 WARNING [train.py:847] Exclude cut with ID cff6a8011a86580fcf47595d18819ae9 from training. Duration: 4.458231292517007, lenth: 2205
+2025-04-24 08:59:10,081 WARNING [train.py:847] Exclude cut with ID 1623cdac3d17f7a211e8a8f434f70657 from training. Duration: 4.191201814058957, lenth: 2308
+2025-04-24 08:59:10,723 WARNING [train.py:847] Exclude cut with ID 273cfc04aebaf8b0cad541d0f3d1cded from training. Duration: 6.408707482993197, lenth: 2609
+2025-04-24 08:59:10,804 WARNING [train.py:847] Exclude cut with ID 7d591b090a42935fd1edd8de97c690b6 from training. Duration: 5.038730158730159, lenth: 2229
+2025-04-24 08:59:10,900 WARNING [train.py:847] Exclude cut with ID 1cfa9ef743b568bb034b415000c8c479 from training. Duration: 7.174965986394557, lenth: 2440
+2025-04-24 08:59:11,699 WARNING [train.py:847] Exclude cut with ID 762676d460719a9203fa7e7537560530 from training. Duration: 3.1346938775510202, lenth: 2443
+2025-04-24 08:59:12,205 WARNING [train.py:847] Exclude cut with ID 43575814ad7bb6d631483cdc513b49ac from training. Duration: 10.95981859410431, lenth: 2380
+2025-04-24 08:59:12,326 WARNING [train.py:847] Exclude cut with ID ae208dee7119ca436f380b99077bcde9 from training. Duration: 5.2128798185941045, lenth: 2615
+2025-04-24 08:59:12,425 WARNING [train.py:847] Exclude cut with ID b56757708077ab00baa7857c1cef8491 from training. Duration: 5.456689342403628, lenth: 2206
+2025-04-24 08:59:12,660 WARNING [train.py:847] Exclude cut with ID f3f6d5671f3d11bc83f8fcad598007ff from training. Duration: 4.179591836734694, lenth: 2262
+2025-04-24 08:59:13,091 WARNING [train.py:847] Exclude cut with ID 6af07f098cd794b9dd5a6bd660f53505 from training. Duration: 4.435011337868481, lenth: 2251
+2025-04-24 08:59:13,234 WARNING [train.py:847] Exclude cut with ID 4ac1131393136a35ef54499bf3c728d4 from training. Duration: 7.732244897959184, lenth: 2211
+2025-04-24 08:59:13,236 WARNING [train.py:847] Exclude cut with ID 482877e78defa83a355d2cdeb5a629ed from training. Duration: 5.154829931972789, lenth: 2891
+2025-04-24 08:59:16,145 WARNING [train.py:847] Exclude cut with ID a93637e8f3e7bdf34bc8eb5d34d34197 from training. Duration: 3.215963718820862, lenth: 2235
+2025-04-24 08:59:17,828 WARNING [train.py:847] Exclude cut with ID 90c6fbf20eeb34c13ffb97a198d84b0b from training. Duration: 5.0967800453514736, lenth: 2236
+2025-04-24 08:59:17,931 INFO [train.py:539] Computing validation loss
log/log-train-2025-04-24-09-09-22
ADDED
@@ -0,0 +1,689 @@
1 |
+
2025-04-24 09:09:22,425 INFO [train.py:653] {
|
2 |
+
"allowed_excess_duration_ratio": 0.1,
|
3 |
+
"audio_key": "question_audio",
|
4 |
+
"batch_idx_train": 0,
|
5 |
+
"best_train_epoch": -1,
|
6 |
+
"best_train_loss": Infinity,
|
7 |
+
"best_valid_epoch": -1,
|
8 |
+
"best_valid_loss": Infinity,
|
9 |
+
"bucketing_sampler": true,
|
10 |
+
"deepscale": false,
|
11 |
+
"deepscale_config": null,
|
12 |
+
"deepspeed": true,
|
13 |
+
"deepspeed_config": "./slam_omni/ds_config_zero1.json",
|
14 |
+
"drop_last": true,
|
15 |
+
"enable_musan": false,
|
16 |
+
"enable_spec_aug": true,
|
17 |
+
"enable_speech_output": true,
|
18 |
+
"encoder_projector_ds_rate": 8,
|
19 |
+
"env_info": {
|
20 |
+
"IP address": "0.114.172.122",
|
21 |
+
"hostname": "7515258",
|
22 |
+
"icefall-git-branch": "master",
|
23 |
+
"icefall-git-date": "Thu Apr 24 08:24:11 2025",
|
24 |
+
"icefall-git-sha1": "2e9be467-dirty",
|
25 |
+
"icefall-path": "/workspace/slam/icefall_omni",
|
26 |
+
"k2-build-type": "Release",
|
27 |
+
"k2-git-date": "Tue Oct 29 09:02:19 2024",
|
28 |
+
"k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
|
29 |
+
"k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
|
30 |
+
"k2-version": "1.24.4",
|
31 |
+
"k2-with-cuda": true,
|
32 |
+
"lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
|
33 |
+
"lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
|
34 |
+
"python-version": "3.11",
|
35 |
+
"torch-cuda-available": true,
|
36 |
+
"torch-cuda-version": "12.4",
|
37 |
+
"torch-version": "2.4.0"
|
38 |
+
},
|
39 |
+
"exp_dir": "slam_omni/exp_speech2speech_rerun",
|
40 |
+
"frame_shift_ms": 10,
|
41 |
+
"huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
|
42 |
+
"input_strategy": "PrecomputedFeatures",
|
43 |
+
"llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
|
44 |
+
"log_interval": 50,
|
45 |
+
"manifest_dir": "data/fbank",
|
46 |
+
"max_duration": 40,
|
47 |
+
"num_buckets": 30,
|
48 |
+
"num_epochs": 10,
|
49 |
+
"num_workers": 2,
|
50 |
+
"on_the_fly_feats": false,
|
51 |
+
"pretrained_model_path": null,
|
52 |
+
"resample_to_16kHz": true,
|
53 |
+
"reset_interval": 200,
|
54 |
+
"return_cuts": true,
|
55 |
+
"sampler_state_dict_path": null,
|
56 |
+
"seed": 42,
|
57 |
+
"shuffle": true,
|
58 |
+
"spec_aug_time_warp_factor": 80,
|
59 |
+
"speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
|
60 |
+
"start_epoch": 1,
|
61 |
+
"subsampling_factor": 2,
|
62 |
+
"tensorboard": true,
|
63 |
+
"text_key": "answer",
|
64 |
+
"unfreeze_llm": true,
|
65 |
+
"unfreeze_speech_projector": true,
|
66 |
+
"use_flash_attn": true,
|
67 |
+
"use_fp16": true,
|
68 |
+
"use_lora": true,
|
69 |
+
"valid_interval": 5000
|
70 |
+
}
|
71 |
+
2025-04-24 09:09:22,426 INFO [train.py:655] About to create model
|
72 |
+
2025-04-24 09:09:42,388 INFO [train.py:808] Number of model parameters: 1326848644
|
73 |
+
2025-04-24 09:09:42,389 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
|
74 |
+
2025-04-24 09:09:42,389 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
75 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
76 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
77 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
78 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
79 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
80 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
81 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
82 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
83 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
84 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
85 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
86 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
87 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
88 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
89 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
90 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
91 |
+
2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
92 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
93 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
94 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
95 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
96 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
97 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
98 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
99 |
+
2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
100 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
101 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
102 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
103 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
104 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
105 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
106 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
107 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
108 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
109 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
110 |
+
2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
111 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
112 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
113 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
114 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
115 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
116 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
117 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
118 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
119 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
120 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
121 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
122 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
123 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
124 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
125 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
126 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
127 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
128 |
+
2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
129 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
130 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
131 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
132 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
133 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
134 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
135 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
136 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
137 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
138 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
139 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
140 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
141 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
142 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
143 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
144 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
145 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
146 |
+
2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
147 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
148 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
149 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
150 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
151 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
152 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
153 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
154 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
155 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
156 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
157 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
158 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
159 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
160 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
161 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
162 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
163 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
164 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
165 |
+
2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
166 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
167 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
168 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
169 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
170 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
171 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
172 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
173 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
174 |
+
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:09:42,409 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
2025-04-24 09:09:42,409 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
2025-04-24 09:09:42,409 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
2025-04-24 09:09:42,410 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
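
The four encoder_projector shapes above pin down a small two-layer projector that maps a 10240-dimensional encoder feature into the 896-dimensional LLM embedding space. A minimal PyTorch sketch consistent with those shapes follows; the activation between the two layers is an assumption, since the log records only weight and bias sizes:

    import torch
    import torch.nn as nn

    class EncoderProjector(nn.Module):
        # Dimensions read off the log: linear1 weight [896, 10240], linear2 weight [896, 896].
        def __init__(self, encoder_dim: int = 10240, llm_dim: int = 896):
            super().__init__()
            self.linear1 = nn.Linear(encoder_dim, llm_dim)
            self.linear2 = nn.Linear(llm_dim, llm_dim)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # The ReLU is assumed; only the linear shapes appear in the log.
            return self.linear2(torch.relu(self.linear1(x)))
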
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:09:42,418 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
2025-04-24 09:09:42,418 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
|
586 |
+
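Each lora_A/lora_B pair in the listing above contributes rank * (in_features + out_features) trainable weights, with lora_A shaped [rank, in_features] and lora_B shaped [out_features, rank] under the usual PEFT convention. A minimal sketch that reproduces the counts (the helper name is ours for illustration, not a function in train.py):

def lora_pair_params(in_features: int, out_features: int, rank: int = 64) -> int:
    """Trainable weights in one LoRA pair: A is [rank, in], B is [out, rank]."""
    return rank * in_features + out_features * rank

# codec_lm o_proj pair above: lora_A [64, 1024] + lora_B [1024, 64]
assert lora_pair_params(1024, 1024) == 64 * 1024 + 1024 * 64   # 131072
# codec_lm mlp.gate_proj pair: lora_A [64, 1024] + lora_B [2048, 64]
assert lora_pair_params(1024, 2048) == 64 * 1024 + 2048 * 64   # 196608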
2025-04-24 09:09:42,419 INFO [train.py:819] Device: cuda:0
2025-04-24 09:09:43,225 INFO [train.py:823] Using DeepSpeed
2025-04-24 09:09:45,005 INFO [data_module.py:445] About to get train cuts
2025-04-24 09:09:45,007 INFO [data_module.py:250] Disable MUSAN
2025-04-24 09:09:45,007 INFO [data_module.py:268] Enable SpecAugment
2025-04-24 09:09:45,007 INFO [data_module.py:269] Time warp factor: 80
2025-04-24 09:09:45,007 INFO [data_module.py:279] Num frame mask: 10
2025-04-24 09:09:45,007 INFO [data_module.py:292] About to create train dataset
2025-04-24 09:09:45,008 INFO [data_module.py:319] Using DynamicBucketingSampler.
2025-04-24 09:09:45,039 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
2025-04-24 09:09:45,222 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
2025-04-24 09:09:45,716 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
2025-04-24 09:09:45,867 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
2025-04-24 09:09:45,933 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
2025-04-24 09:09:45,998 INFO [data_module.py:336] About to create train dataloader
2025-04-24 09:09:45,999 INFO [data_module.py:436] About to get test cuts
2025-04-24 09:09:45,999 INFO [data_module.py:365] About to create dev dataset
2025-04-24 09:09:46,086 INFO [data_module.py:379] About to create dev dataloader
2025-04-24 09:09:46,092 INFO [train.py:875] start training from epoch 1
2025-04-24 09:09:46,257 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
2025-04-24 09:09:46,483 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
2025-04-24 09:09:47,106 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
2025-04-24 09:09:47,300 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
2025-04-24 09:09:47,378 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
2025-04-24 09:09:47,927 WARNING [train.py:847] Exclude cut with ID de225db1da1393474e4ce9af8cbafb42 from training. Duration: 5.433469387755102, lenth: 2230
2025-04-24 09:09:48,216 WARNING [train.py:847] Exclude cut with ID ef376c7d1fb4606edb0547b2a7fb95b3 from training. Duration: 2.7399546485260773, lenth: 2637
2025-04-24 09:09:48,545 WARNING [train.py:847] Exclude cut with ID c7f3c7da44c2c6e27f86bda421b958d8 from training. Duration: 2.995374149659864, lenth: 2318
2025-04-24 09:09:48,715 WARNING [train.py:847] Exclude cut with ID 8d724f452d1d99549eebf90ab658e7bd from training. Duration: 5.712108843537415, lenth: 2209
2025-04-24 09:09:48,829 WARNING [train.py:847] Exclude cut with ID a585785e795cb35d02d8f7b599352e16 from training. Duration: 4.911020408163266, lenth: 2575
2025-04-24 09:09:48,909 WARNING [train.py:847] Exclude cut with ID 31a00c5e63c88ef67402f578ebc6ebf7 from training. Duration: 3.575873015873016, lenth: 2307
2025-04-24 09:09:49,023 WARNING [train.py:847] Exclude cut with ID 32ab4761aaf08d9473fc8388c3a6cffc from training. Duration: 7.9760544217687075, lenth: 2201
2025-04-24 09:09:49,079 WARNING [train.py:847] Exclude cut with ID 5fddb6bf9d5265c0409b81c85fadbaec from training. Duration: 3.4597732426303853, lenth: 2210
2025-04-24 09:09:49,909 WARNING [train.py:847] Exclude cut with ID c5d43a2443ce7820cf864f7c55eafb77 from training. Duration: 3.912562358276644, lenth: 2315
2025-04-24 09:09:50,954 WARNING [train.py:847] Exclude cut with ID a951e36267c8901d4c03fb6b4dffe636 from training. Duration: 4.73687074829932, lenth: 2223
2025-04-24 09:09:51,025 WARNING [train.py:847] Exclude cut with ID 595680451c1ee7e42dba0452e165dea3 from training. Duration: 3.959002267573696, lenth: 2251
2025-04-24 09:09:51,576 WARNING [train.py:847] Exclude cut with ID 20fa9b11557159d78f72ccc6b464de18 from training. Duration: 3.9938321995464854, lenth: 2215
2025-04-24 09:09:51,630 WARNING [train.py:847] Exclude cut with ID 8a01fc72b4108673fd8393e523450e23 from training. Duration: 8.719092970521542, lenth: 2517
2025-04-24 09:09:52,139 WARNING [train.py:847] Exclude cut with ID 924d54288847d448a20809a26b320477 from training. Duration: 3.81968253968254, lenth: 2302
2025-04-24 09:09:52,437 WARNING [train.py:847] Exclude cut with ID a7f91a17f6318d896d0bee7b7307d000 from training. Duration: 5.294149659863946, lenth: 2239
2025-04-24 09:09:52,922 WARNING [train.py:847] Exclude cut with ID 8d7cff5d1d25a773f9d27590b5912b60 from training. Duration: 4.597551020408163, lenth: 2203
2025-04-24 09:09:53,040 WARNING [train.py:847] Exclude cut with ID 13407ca1ec78def38346569b54d11a78 from training. Duration: 5.909478458049887, lenth: 2280
2025-04-24 09:09:53,044 WARNING [train.py:847] Exclude cut with ID 6390ab6964b52f2ee9bf36826393abaa from training. Duration: 4.017052154195011, lenth: 2209
2025-04-24 09:09:53,086 WARNING [train.py:847] Exclude cut with ID f55c19435d6643ebf7356a1072a014fa from training. Duration: 5.259319727891157, lenth: 2310
2025-04-24 09:09:53,765 WARNING [train.py:847] Exclude cut with ID 546f5fb6d11db41d104ee7b0667f4e1d from training. Duration: 5.712108843537415, lenth: 2770
2025-04-24 09:09:54,071 WARNING [train.py:847] Exclude cut with ID b2069846ab4df18f5caa676751aa0b40 from training. Duration: 6.6525170068027215, lenth: 2588
2025-04-24 09:09:54,445 WARNING [train.py:847] Exclude cut with ID 5717a65b9855425c0330e6022a71cc2f from training. Duration: 5.073560090702948, lenth: 2269
2025-04-24 09:09:54,746 WARNING [train.py:847] Exclude cut with ID d6e37a975b3c69a86a808347b53e42f2 from training. Duration: 5.259319727891157, lenth: 2885
2025-04-24 09:09:54,894 WARNING [train.py:847] Exclude cut with ID d15e75257f8dac616c078113f8f96c63 from training. Duration: 7.6741950113378685, lenth: 2276
2025-04-24 09:09:54,958 WARNING [train.py:847] Exclude cut with ID 7ea49b35e0dc536537ae3ddcd07032af from training. Duration: 4.829750566893424, lenth: 2206
2025-04-24 09:09:55,412 WARNING [train.py:847] Exclude cut with ID 0a1a80c13fd0c38d97c9491dff84c7da from training. Duration: 2.914104308390023, lenth: 2220
2025-04-24 09:09:55,461 WARNING [train.py:847] Exclude cut with ID 359ff41c694bb848c3e37b948b1434f7 from training. Duration: 6.478367346938776, lenth: 2881
2025-04-24 09:09:55,469 WARNING [train.py:847] Exclude cut with ID 6989881203588034ddf38ae5a7cfe238 from training. Duration: 5.491519274376417, lenth: 2243
2025-04-24 09:09:55,741 WARNING [train.py:847] Exclude cut with ID fd7b5cb9a6bce2c02bf8a7c3dd2b35d6 from training. Duration: 5.793378684807256, lenth: 2305
2025-04-24 09:09:56,713 WARNING [train.py:847] Exclude cut with ID e1c5efd6d4cd7c6dc0f3ac7720161c49 from training. Duration: 9.438911564625851, lenth: 2201
2025-04-24 09:09:57,105 WARNING [train.py:847] Exclude cut with ID c13ffcc7b60ea460731e417d2d9575cf from training. Duration: 3.250793650793651, lenth: 2205
2025-04-24 09:09:57,196 WARNING [train.py:847] Exclude cut with ID ba1a37e64efe00d8b8cbfb0069e65f8d from training. Duration: 3.274013605442177, lenth: 2290
2025-04-24 09:09:57,273 WARNING [train.py:847] Exclude cut with ID de9f5fe02034bc4b0a66fbe378fffb1a from training. Duration: 6.455147392290249, lenth: 2228
2025-04-24 09:09:57,753 WARNING [train.py:847] Exclude cut with ID 8ca7a35870994edd410e1ea8e951faa3 from training. Duration: 7.012426303854875, lenth: 2233
2025-04-24 09:09:57,897 WARNING [train.py:847] Exclude cut with ID 3260368505eddf46fb1c9b562124ff18 from training. Duration: 12.097596371882085, lenth: 2261
2025-04-24 09:09:58,088 WARNING [train.py:847] Exclude cut with ID 36878ea551121147626747e9fc39ae17 from training. Duration: 5.537959183673469, lenth: 2224
2025-04-24 09:09:59,042 WARNING [train.py:847] Exclude cut with ID d4c7270e6c9b3dffed8643040b5f922b from training. Duration: 4.133151927437642, lenth: 2329
2025-04-24 09:09:59,502 WARNING [train.py:847] Exclude cut with ID 179517464c97fbbcb8909a56b849bf73 from training. Duration: 6.617687074829932, lenth: 2257
2025-04-24 09:09:59,704 WARNING [train.py:847] Exclude cut with ID 17480e9524e4c0f8e405487e50c8e98e from training. Duration: 4.551111111111111, lenth: 2263
2025-04-24 09:09:59,792 WARNING [train.py:847] Exclude cut with ID 2c10af552b911b8ac45c1fc7bb9fcb5d from training. Duration: 6.2693877551020405, lenth: 2481
2025-04-24 09:09:59,902 WARNING [train.py:847] Exclude cut with ID aa2539588175d8c96975a06a8993e5b6 from training. Duration: 5.456689342403628, lenth: 2573
2025-04-24 09:09:59,933 WARNING [train.py:847] Exclude cut with ID 32945d8e21a8628bb3c93fd499330639 from training. Duration: 13.455963718820861, lenth: 2205
2025-04-24 09:10:00,098 WARNING [train.py:847] Exclude cut with ID 64db5d6166c3edc340d8e1b02c059f19 from training. Duration: 4.818140589569161, lenth: 2259
2025-04-24 09:10:00,235 WARNING [train.py:847] Exclude cut with ID 39483151e08291efe16217ab41622d59 from training. Duration: 7.952834467120182, lenth: 2439
2025-04-24 09:10:00,924 WARNING [train.py:847] Exclude cut with ID c4a74dd23aa9a3fb13330113787139d6 from training. Duration: 3.239183673469388, lenth: 2276
2025-04-24 09:10:00,966 WARNING [train.py:847] Exclude cut with ID b12a8836f1aa525a763ac657a44229a3 from training. Duration: 5.0967800453514736, lenth: 2427
2025-04-24 09:10:04,105 WARNING [train.py:847] Exclude cut with ID 185cfa3ae9c12babe1e217de612f0c7b from training. Duration: 6.257777777777778, lenth: 2587
2025-04-24 09:10:04,192 WARNING [train.py:847] Exclude cut with ID 08eff5b64d191a0d50634630437a3107 from training. Duration: 10.135510204081633, lenth: 2273
2025-04-24 09:10:04,279 WARNING [train.py:847] Exclude cut with ID bcd8207538deeccfcf8d1de872b8f7e9 from training. Duration: 4.179591836734694, lenth: 2242
2025-04-24 09:10:04,474 WARNING [train.py:847] Exclude cut with ID f909651c923fdd6cc0eaa81de64f4a5b from training. Duration: 8.068934240362811, lenth: 2421
2025-04-24 09:10:04,502 WARNING [train.py:847] Exclude cut with ID f5d34097b89bd201018189ff44b10b33 from training. Duration: 3.7964625850340137, lenth: 2250
2025-04-24 09:10:04,592 WARNING [train.py:847] Exclude cut with ID 245530abe5a62b390611b7a3b91ddfef from training. Duration: 9.079002267573696, lenth: 2299
2025-04-24 09:10:05,468 WARNING [train.py:847] Exclude cut with ID 79d0d3129c764fd4d9c5391b6353e349 from training. Duration: 5.0967800453514736, lenth: 2309
2025-04-24 09:10:05,505 WARNING [train.py:847] Exclude cut with ID 1969177f73511ef204642f43ce029d05 from training. Duration: 3.657142857142857, lenth: 2339
2025-04-24 09:10:06,014 WARNING [train.py:847] Exclude cut with ID f601e756ecd6074cd3037c61e413cb6e from training. Duration: 5.712108843537415, lenth: 2287
2025-04-24 09:10:06,271 WARNING [train.py:847] Exclude cut with ID 9f7758e035b3eebcb9267aa49ad65d4c from training. Duration: 3.9938321995464854, lenth: 2303
2025-04-24 09:10:07,035 WARNING [train.py:847] Exclude cut with ID b29a42e8c0af813bf605d7f0a974620c from training. Duration: 5.050340136054421, lenth: 2219
2025-04-24 09:10:07,208 WARNING [train.py:847] Exclude cut with ID 192e5517b050a4659892a2832d986764 from training. Duration: 5.12, lenth: 2420
2025-04-24 09:10:07,376 WARNING [train.py:847] Exclude cut with ID bc262e923d16a434a3c5701bea841e4e from training. Duration: 7.418775510204082, lenth: 2205
2025-04-24 09:10:07,535 WARNING [train.py:847] Exclude cut with ID 5141e01019dec47ed309fbf5881ab675 from training. Duration: 8.034104308390022, lenth: 2284
2025-04-24 09:10:07,564 WARNING [train.py:847] Exclude cut with ID 3555e5c7123d744b39ba7ecce44ce1e2 from training. Duration: 3.7151927437641725, lenth: 2226
2025-04-24 09:10:08,012 WARNING [train.py:847] Exclude cut with ID 973a1dc43ca452580d6999027a82fbec from training. Duration: 5.270929705215419, lenth: 2250
2025-04-24 09:10:08,113 WARNING [train.py:847] Exclude cut with ID e2791accb62bb5e4bde9e0e2b04945c5 from training. Duration: 3.517823129251701, lenth: 2286
2025-04-24 09:10:08,285 WARNING [train.py:847] Exclude cut with ID e91152d010c4479932b7ab64a998d5c9 from training. Duration: 4.771700680272109, lenth: 2468
2025-04-24 09:10:08,418 WARNING [train.py:847] Exclude cut with ID cff6a8011a86580fcf47595d18819ae9 from training. Duration: 4.458231292517007, lenth: 2205
2025-04-24 09:10:08,466 WARNING [train.py:847] Exclude cut with ID 1623cdac3d17f7a211e8a8f434f70657 from training. Duration: 4.191201814058957, lenth: 2308
2025-04-24 09:10:09,098 WARNING [train.py:847] Exclude cut with ID 273cfc04aebaf8b0cad541d0f3d1cded from training. Duration: 6.408707482993197, lenth: 2609
2025-04-24 09:10:09,182 WARNING [train.py:847] Exclude cut with ID 7d591b090a42935fd1edd8de97c690b6 from training. Duration: 5.038730158730159, lenth: 2229
2025-04-24 09:10:09,281 WARNING [train.py:847] Exclude cut with ID 1cfa9ef743b568bb034b415000c8c479 from training. Duration: 7.174965986394557, lenth: 2440
2025-04-24 09:10:10,073 WARNING [train.py:847] Exclude cut with ID 762676d460719a9203fa7e7537560530 from training. Duration: 3.1346938775510202, lenth: 2443
2025-04-24 09:10:10,581 WARNING [train.py:847] Exclude cut with ID 43575814ad7bb6d631483cdc513b49ac from training. Duration: 10.95981859410431, lenth: 2380
2025-04-24 09:10:10,698 WARNING [train.py:847] Exclude cut with ID ae208dee7119ca436f380b99077bcde9 from training. Duration: 5.2128798185941045, lenth: 2615
2025-04-24 09:10:10,784 WARNING [train.py:847] Exclude cut with ID b56757708077ab00baa7857c1cef8491 from training. Duration: 5.456689342403628, lenth: 2206
2025-04-24 09:10:11,014 WARNING [train.py:847] Exclude cut with ID f3f6d5671f3d11bc83f8fcad598007ff from training. Duration: 4.179591836734694, lenth: 2262
2025-04-24 09:10:11,431 WARNING [train.py:847] Exclude cut with ID 6af07f098cd794b9dd5a6bd660f53505 from training. Duration: 4.435011337868481, lenth: 2251
2025-04-24 09:10:11,568 WARNING [train.py:847] Exclude cut with ID 4ac1131393136a35ef54499bf3c728d4 from training. Duration: 7.732244897959184, lenth: 2211
2025-04-24 09:10:11,570 WARNING [train.py:847] Exclude cut with ID 482877e78defa83a355d2cdeb5a629ed from training. Duration: 5.154829931972789, lenth: 2891
2025-04-24 09:10:13,777 WARNING [train.py:847] Exclude cut with ID a93637e8f3e7bdf34bc8eb5d34d34197 from training. Duration: 3.215963718820862, lenth: 2235
2025-04-24 09:10:15,429 WARNING [train.py:847] Exclude cut with ID 90c6fbf20eeb34c13ffb97a198d84b0b from training. Duration: 5.0967800453514736, lenth: 2236
2025-04-24 09:10:15,536 INFO [train.py:539] Computing validation loss
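Every excluded cut above reports a token "lenth" greater than 2200, so the filter at train.py:847 appears to cap the target speech-token sequence length around that value. A sketch of such a filter, with the threshold and the custom-field key both inferred rather than read from train.py (the "lenth" spelling in the message mirrors the log):

import logging

MAX_SPEECH_TOKEN_LEN = 2200  # assumed cutoff, inferred from the warnings above

def keep_cut(cut) -> bool:
    # "speech_token" is a hypothetical custom-field key for the codec targets.
    num_tokens = len(cut.custom["speech_token"])
    if num_tokens > MAX_SPEECH_TOKEN_LEN:
        logging.warning(
            f"Exclude cut with ID {cut.id} from training. "
            f"Duration: {cut.duration}, lenth: {num_tokens}"
        )
        return False
    return True

# lhotse CutSet.filter is lazy, so the warnings fire as batches are drawn,
# which matches the warnings repeating after "start training from epoch 1".
train_cuts = train_cuts.filter(keep_cut)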
log/log-train-2025-04-24-09-15-56
ADDED
@@ -0,0 +1,689 @@
2025-04-24 09:15:56,357 INFO [train.py:653] {
"allowed_excess_duration_ratio": 0.1,
"audio_key": "question_audio",
"batch_idx_train": 0,
"best_train_epoch": -1,
"best_train_loss": Infinity,
"best_valid_epoch": -1,
"best_valid_loss": Infinity,
"bucketing_sampler": true,
"deepscale": false,
"deepscale_config": null,
"deepspeed": true,
"deepspeed_config": "./slam_omni/ds_config_zero1.json",
"drop_last": true,
"enable_musan": false,
"enable_spec_aug": true,
"enable_speech_output": true,
"encoder_projector_ds_rate": 8,
"env_info": {
"IP address": "0.114.172.122",
"hostname": "7515258",
"icefall-git-branch": "master",
"icefall-git-date": "Thu Apr 24 08:24:11 2025",
"icefall-git-sha1": "2e9be467-dirty",
"icefall-path": "/workspace/slam/icefall_omni",
"k2-build-type": "Release",
"k2-git-date": "Tue Oct 29 09:02:19 2024",
"k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
"k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
"k2-version": "1.24.4",
"k2-with-cuda": true,
"lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
"lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
"python-version": "3.11",
"torch-cuda-available": true,
"torch-cuda-version": "12.4",
"torch-version": "2.4.0"
},
"exp_dir": "slam_omni/exp_speech2speech_rerun",
"frame_shift_ms": 10,
"huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
"input_strategy": "PrecomputedFeatures",
"llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
"log_interval": 50,
"manifest_dir": "data/fbank",
"max_duration": 40,
"num_buckets": 30,
"num_epochs": 10,
"num_workers": 2,
"on_the_fly_feats": false,
"pretrained_model_path": null,
"resample_to_16kHz": true,
"reset_interval": 200,
"return_cuts": true,
"sampler_state_dict_path": null,
"seed": 42,
"shuffle": true,
"spec_aug_time_warp_factor": 80,
"speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
"start_epoch": 1,
"subsampling_factor": 2,
"tensorboard": true,
"text_key": "answer",
"unfreeze_llm": true,
"unfreeze_speech_projector": true,
"use_flash_attn": true,
"use_fp16": true,
"use_lora": true,
"valid_interval": 5000
}
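Several of the dataloader settings in this dict map directly onto the lhotse sampler that data_module.py reports creating ("Using DynamicBucketingSampler."). A sketch of that wiring, assuming data_module.py passes the values through unchanged (train_cuts stands for the CutSet fetched in the "About to get train cuts" step):

from lhotse.dataset import DynamicBucketingSampler

sampler = DynamicBucketingSampler(
    train_cuts,
    max_duration=40,   # "max_duration": 40 -- seconds of audio per batch
    num_buckets=30,    # "num_buckets": 30
    shuffle=True,      # "shuffle": true
    drop_last=True,    # "drop_last": true
    seed=42,           # "seed": 42
)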
71 |
+
2025-04-24 09:15:56,357 INFO [train.py:655] About to create model
|
72 |
+
2025-04-24 09:16:18,941 INFO [train.py:808] Number of model parameters: 1326848644
|
73 |
+
2025-04-24 09:16:18,941 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
|
74 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
75 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
76 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
77 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
78 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
79 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
80 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
81 |
+
2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
82 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
83 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
84 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
85 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
86 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
87 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
88 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
89 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
90 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
91 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
92 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
93 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
94 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
95 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
96 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
97 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
98 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
99 |
+
2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
100 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
101 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
102 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
103 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
104 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
105 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
106 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
107 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
108 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
109 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
110 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
111 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
112 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
113 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
114 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
115 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
116 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
117 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
118 |
+
2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
119 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
120 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
121 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
122 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
123 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
124 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
125 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
126 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
127 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
128 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
129 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
130 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
131 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
132 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
133 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
134 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
135 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
136 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
137 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
138 |
+
2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
139 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
140 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
141 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
142 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
143 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
144 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
145 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
146 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
147 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
148 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
149 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
150 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
151 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
152 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
153 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
154 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
155 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
156 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
157 |
+
2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
158 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
159 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
160 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
161 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
162 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
163 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
164 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
165 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
166 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
167 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
168 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
169 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
170 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
171 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
172 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
173 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
174 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
175 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
176 |
+
2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
177 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
178 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
179 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
180 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
181 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
182 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
183 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
184 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
185 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
186 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
187 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
188 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
189 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
190 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
191 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
192 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
193 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
194 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
195 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
196 |
+
2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
197 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
198 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
199 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
200 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
201 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
202 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
203 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
204 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
205 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
206 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
207 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
208 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
209 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
210 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
211 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
212 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
213 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
214 |
+
2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
215 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
216 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
217 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
218 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
219 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
220 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
221 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
222 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
223 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
224 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
225 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
226 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
227 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
228 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
229 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
230 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
231 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
232 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
233 |
+
2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
234 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
235 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
236 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
237 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
238 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
239 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
240 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
241 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
242 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
243 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
244 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
245 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
246 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
247 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
248 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
249 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
250 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
251 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
252 |
+
2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
253 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
254 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
255 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
256 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
257 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
258 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
259 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
260 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
261 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
262 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
263 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
264 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
265 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
266 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
267 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
268 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
269 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
270 |
+
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 09:16:18,969 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
2025-04-24 09:16:18,969 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
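The dump above lists every tensor left trainable: rank-64 LoRA adapters on each attention and MLP projection (each lora_A maps the module's input width down to 64 and each lora_B maps 64 back up to the output width, hence pairs like [64, 896] / [896, 64]; the [128, 64] k/v shapes reflect grouped-query attention), plus the full-rank encoder projector, speech-token projector, and codec LM head. A minimal sketch of how such a dump is typically produced (plain PyTorch; the helper name is illustrative, not the repo's exact code):

```python
import logging

from torch import nn


def log_trainable_parameters(model: nn.Module) -> None:
    """Log the name and shape of every trainable tensor, one line each."""
    num_trainable = 0
    for name, param in model.named_parameters():
        if param.requires_grad:  # frozen base weights are skipped
            logging.info(f"{name}: {param.shape}")
            num_trainable += param.numel()
    logging.info(f"Total trainable parameters: {num_trainable}")
```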
2025-04-24 09:16:18,969 INFO [train.py:819] Device: cuda:0
2025-04-24 09:16:19,658 INFO [train.py:823] Using DeepSpeed
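"Using DeepSpeed" means the model is handed to the DeepSpeed engine rather than plain DDP. A hedged sketch of the usual wiring (the argument names and config plumbing below are assumptions, not read from this log):

```python
import deepspeed

# Hypothetical wiring; train.py's actual flags and config are not shown here.
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args,  # assumed to carry the DeepSpeed JSON config path
    model=model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
)
```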
2025-04-24 09:16:21,501 INFO [data_module.py:445] About to get train cuts
2025-04-24 09:16:21,504 INFO [data_module.py:250] Disable MUSAN
2025-04-24 09:16:21,504 INFO [data_module.py:268] Enable SpecAugment
2025-04-24 09:16:21,504 INFO [data_module.py:269] Time warp factor: 80
2025-04-24 09:16:21,505 INFO [data_module.py:279] Num frame mask: 10
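The data module disables MUSAN noise mixing and enables SpecAugment with a time-warp factor of 80 and 10 time-frame masks. In lhotse that roughly corresponds to the following; the mask sizes are illustrative defaults, not values read from this log:

```python
from lhotse.dataset import SpecAugment

spec_augment = SpecAugment(
    time_warp_factor=80,    # matches "Time warp factor: 80"
    num_frame_masks=10,     # matches "Num frame mask: 10"
    num_feature_masks=2,    # assumed default, not logged
    features_mask_size=27,  # assumed default, not logged
    frames_mask_size=100,   # assumed default, not logged
)
```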
2025-04-24 09:16:21,505 INFO [data_module.py:292] About to create train dataset
2025-04-24 09:16:21,505 INFO [data_module.py:319] Using DynamicBucketingSampler.
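DynamicBucketingSampler groups cuts of similar duration so that each batch stays near a fixed total-seconds budget. A sketch of typical construction (the max_duration and bucket count are hypothetical; they come from training flags not shown in this log):

```python
from lhotse.dataset import DynamicBucketingSampler

train_sampler = DynamicBucketingSampler(
    train_cuts,          # the CutSet fetched above
    max_duration=200.0,  # hypothetical per-batch budget in seconds
    num_buckets=30,      # hypothetical
    shuffle=True,
    drop_last=True,
)
```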
2025-04-24 09:16:21,537 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
2025-04-24 09:16:21,720 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
2025-04-24 09:16:22,210 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
2025-04-24 09:16:22,364 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
2025-04-24 09:16:22,430 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
2025-04-24 09:16:22,494 INFO [data_module.py:336] About to create train dataloader
2025-04-24 09:16:22,495 INFO [data_module.py:436] About to get test cuts
2025-04-24 09:16:22,496 INFO [data_module.py:365] About to create dev dataset
2025-04-24 09:16:22,586 INFO [data_module.py:379] About to create dev dataloader
2025-04-24 09:16:22,594 INFO [train.py:875] start training from epoch 1
2025-04-24 09:16:22,719 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
2025-04-24 09:16:22,947 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
2025-04-24 09:16:23,573 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
2025-04-24 09:16:23,764 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
2025-04-24 09:16:23,840 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
2025-04-24 09:16:24,386 WARNING [train.py:847] Exclude cut with ID de225db1da1393474e4ce9af8cbafb42 from training. Duration: 5.433469387755102, lenth: 2230
2025-04-24 09:16:24,677 WARNING [train.py:847] Exclude cut with ID ef376c7d1fb4606edb0547b2a7fb95b3 from training. Duration: 2.7399546485260773, lenth: 2637
2025-04-24 09:16:25,021 WARNING [train.py:847] Exclude cut with ID c7f3c7da44c2c6e27f86bda421b958d8 from training. Duration: 2.995374149659864, lenth: 2318
2025-04-24 09:16:25,194 WARNING [train.py:847] Exclude cut with ID 8d724f452d1d99549eebf90ab658e7bd from training. Duration: 5.712108843537415, lenth: 2209
2025-04-24 09:16:25,310 WARNING [train.py:847] Exclude cut with ID a585785e795cb35d02d8f7b599352e16 from training. Duration: 4.911020408163266, lenth: 2575
2025-04-24 09:16:25,391 WARNING [train.py:847] Exclude cut with ID 31a00c5e63c88ef67402f578ebc6ebf7 from training. Duration: 3.575873015873016, lenth: 2307
2025-04-24 09:16:25,508 WARNING [train.py:847] Exclude cut with ID 32ab4761aaf08d9473fc8388c3a6cffc from training. Duration: 7.9760544217687075, lenth: 2201
2025-04-24 09:16:25,564 WARNING [train.py:847] Exclude cut with ID 5fddb6bf9d5265c0409b81c85fadbaec from training. Duration: 3.4597732426303853, lenth: 2210
2025-04-24 09:16:26,408 WARNING [train.py:847] Exclude cut with ID c5d43a2443ce7820cf864f7c55eafb77 from training. Duration: 3.912562358276644, lenth: 2315
2025-04-24 09:16:27,597 WARNING [train.py:847] Exclude cut with ID a951e36267c8901d4c03fb6b4dffe636 from training. Duration: 4.73687074829932, lenth: 2223
2025-04-24 09:16:27,668 WARNING [train.py:847] Exclude cut with ID 595680451c1ee7e42dba0452e165dea3 from training. Duration: 3.959002267573696, lenth: 2251
2025-04-24 09:16:28,218 WARNING [train.py:847] Exclude cut with ID 20fa9b11557159d78f72ccc6b464de18 from training. Duration: 3.9938321995464854, lenth: 2215
2025-04-24 09:16:28,272 WARNING [train.py:847] Exclude cut with ID 8a01fc72b4108673fd8393e523450e23 from training. Duration: 8.719092970521542, lenth: 2517
2025-04-24 09:16:28,798 WARNING [train.py:847] Exclude cut with ID 924d54288847d448a20809a26b320477 from training. Duration: 3.81968253968254, lenth: 2302
2025-04-24 09:16:29,100 WARNING [train.py:847] Exclude cut with ID a7f91a17f6318d896d0bee7b7307d000 from training. Duration: 5.294149659863946, lenth: 2239
2025-04-24 09:16:29,600 WARNING [train.py:847] Exclude cut with ID 8d7cff5d1d25a773f9d27590b5912b60 from training. Duration: 4.597551020408163, lenth: 2203
2025-04-24 09:16:29,720 WARNING [train.py:847] Exclude cut with ID 13407ca1ec78def38346569b54d11a78 from training. Duration: 5.909478458049887, lenth: 2280
2025-04-24 09:16:29,724 WARNING [train.py:847] Exclude cut with ID 6390ab6964b52f2ee9bf36826393abaa from training. Duration: 4.017052154195011, lenth: 2209
2025-04-24 09:16:29,769 WARNING [train.py:847] Exclude cut with ID f55c19435d6643ebf7356a1072a014fa from training. Duration: 5.259319727891157, lenth: 2310
2025-04-24 09:16:30,441 WARNING [train.py:847] Exclude cut with ID 546f5fb6d11db41d104ee7b0667f4e1d from training. Duration: 5.712108843537415, lenth: 2770
2025-04-24 09:16:30,732 WARNING [train.py:847] Exclude cut with ID b2069846ab4df18f5caa676751aa0b40 from training. Duration: 6.6525170068027215, lenth: 2588
2025-04-24 09:16:31,105 WARNING [train.py:847] Exclude cut with ID 5717a65b9855425c0330e6022a71cc2f from training. Duration: 5.073560090702948, lenth: 2269
2025-04-24 09:16:31,416 WARNING [train.py:847] Exclude cut with ID d6e37a975b3c69a86a808347b53e42f2 from training. Duration: 5.259319727891157, lenth: 2885
2025-04-24 09:16:31,564 WARNING [train.py:847] Exclude cut with ID d15e75257f8dac616c078113f8f96c63 from training. Duration: 7.6741950113378685, lenth: 2276
2025-04-24 09:16:31,632 WARNING [train.py:847] Exclude cut with ID 7ea49b35e0dc536537ae3ddcd07032af from training. Duration: 4.829750566893424, lenth: 2206
2025-04-24 09:16:32,095 WARNING [train.py:847] Exclude cut with ID 0a1a80c13fd0c38d97c9491dff84c7da from training. Duration: 2.914104308390023, lenth: 2220
2025-04-24 09:16:32,143 WARNING [train.py:847] Exclude cut with ID 359ff41c694bb848c3e37b948b1434f7 from training. Duration: 6.478367346938776, lenth: 2881
2025-04-24 09:16:32,151 WARNING [train.py:847] Exclude cut with ID 6989881203588034ddf38ae5a7cfe238 from training. Duration: 5.491519274376417, lenth: 2243
2025-04-24 09:16:32,424 WARNING [train.py:847] Exclude cut with ID fd7b5cb9a6bce2c02bf8a7c3dd2b35d6 from training. Duration: 5.793378684807256, lenth: 2305
2025-04-24 09:16:33,613 WARNING [train.py:847] Exclude cut with ID e1c5efd6d4cd7c6dc0f3ac7720161c49 from training. Duration: 9.438911564625851, lenth: 2201
2025-04-24 09:16:34,013 WARNING [train.py:847] Exclude cut with ID c13ffcc7b60ea460731e417d2d9575cf from training. Duration: 3.250793650793651, lenth: 2205
2025-04-24 09:16:34,104 WARNING [train.py:847] Exclude cut with ID ba1a37e64efe00d8b8cbfb0069e65f8d from training. Duration: 3.274013605442177, lenth: 2290
2025-04-24 09:16:34,183 WARNING [train.py:847] Exclude cut with ID de9f5fe02034bc4b0a66fbe378fffb1a from training. Duration: 6.455147392290249, lenth: 2228
2025-04-24 09:16:34,674 WARNING [train.py:847] Exclude cut with ID 8ca7a35870994edd410e1ea8e951faa3 from training. Duration: 7.012426303854875, lenth: 2233
2025-04-24 09:16:34,821 WARNING [train.py:847] Exclude cut with ID 3260368505eddf46fb1c9b562124ff18 from training. Duration: 12.097596371882085, lenth: 2261
2025-04-24 09:16:35,017 WARNING [train.py:847] Exclude cut with ID 36878ea551121147626747e9fc39ae17 from training. Duration: 5.537959183673469, lenth: 2224
2025-04-24 09:16:35,938 WARNING [train.py:847] Exclude cut with ID d4c7270e6c9b3dffed8643040b5f922b from training. Duration: 4.133151927437642, lenth: 2329
2025-04-24 09:16:36,406 WARNING [train.py:847] Exclude cut with ID 179517464c97fbbcb8909a56b849bf73 from training. Duration: 6.617687074829932, lenth: 2257
2025-04-24 09:16:36,608 WARNING [train.py:847] Exclude cut with ID 17480e9524e4c0f8e405487e50c8e98e from training. Duration: 4.551111111111111, lenth: 2263
2025-04-24 09:16:36,697 WARNING [train.py:847] Exclude cut with ID 2c10af552b911b8ac45c1fc7bb9fcb5d from training. Duration: 6.2693877551020405, lenth: 2481
2025-04-24 09:16:36,805 WARNING [train.py:847] Exclude cut with ID aa2539588175d8c96975a06a8993e5b6 from training. Duration: 5.456689342403628, lenth: 2573
2025-04-24 09:16:36,842 WARNING [train.py:847] Exclude cut with ID 32945d8e21a8628bb3c93fd499330639 from training. Duration: 13.455963718820861, lenth: 2205
2025-04-24 09:16:37,010 WARNING [train.py:847] Exclude cut with ID 64db5d6166c3edc340d8e1b02c059f19 from training. Duration: 4.818140589569161, lenth: 2259
2025-04-24 09:16:37,145 WARNING [train.py:847] Exclude cut with ID 39483151e08291efe16217ab41622d59 from training. Duration: 7.952834467120182, lenth: 2439
2025-04-24 09:16:37,842 WARNING [train.py:847] Exclude cut with ID c4a74dd23aa9a3fb13330113787139d6 from training. Duration: 3.239183673469388, lenth: 2276
2025-04-24 09:16:37,885 WARNING [train.py:847] Exclude cut with ID b12a8836f1aa525a763ac657a44229a3 from training. Duration: 5.0967800453514736, lenth: 2427
2025-04-24 09:16:41,266 WARNING [train.py:847] Exclude cut with ID 185cfa3ae9c12babe1e217de612f0c7b from training. Duration: 6.257777777777778, lenth: 2587
2025-04-24 09:16:41,353 WARNING [train.py:847] Exclude cut with ID 08eff5b64d191a0d50634630437a3107 from training. Duration: 10.135510204081633, lenth: 2273
2025-04-24 09:16:41,441 WARNING [train.py:847] Exclude cut with ID bcd8207538deeccfcf8d1de872b8f7e9 from training. Duration: 4.179591836734694, lenth: 2242
2025-04-24 09:16:41,639 WARNING [train.py:847] Exclude cut with ID f909651c923fdd6cc0eaa81de64f4a5b from training. Duration: 8.068934240362811, lenth: 2421
2025-04-24 09:16:41,667 WARNING [train.py:847] Exclude cut with ID f5d34097b89bd201018189ff44b10b33 from training. Duration: 3.7964625850340137, lenth: 2250
2025-04-24 09:16:41,758 WARNING [train.py:847] Exclude cut with ID 245530abe5a62b390611b7a3b91ddfef from training. Duration: 9.079002267573696, lenth: 2299
2025-04-24 09:16:42,647 WARNING [train.py:847] Exclude cut with ID 79d0d3129c764fd4d9c5391b6353e349 from training. Duration: 5.0967800453514736, lenth: 2309
2025-04-24 09:16:42,684 WARNING [train.py:847] Exclude cut with ID 1969177f73511ef204642f43ce029d05 from training. Duration: 3.657142857142857, lenth: 2339
2025-04-24 09:16:43,198 WARNING [train.py:847] Exclude cut with ID f601e756ecd6074cd3037c61e413cb6e from training. Duration: 5.712108843537415, lenth: 2287
2025-04-24 09:16:43,450 WARNING [train.py:847] Exclude cut with ID 9f7758e035b3eebcb9267aa49ad65d4c from training. Duration: 3.9938321995464854, lenth: 2303
2025-04-24 09:16:43,934 WARNING [train.py:847] Exclude cut with ID b29a42e8c0af813bf605d7f0a974620c from training. Duration: 5.050340136054421, lenth: 2219
2025-04-24 09:16:44,106 WARNING [train.py:847] Exclude cut with ID 192e5517b050a4659892a2832d986764 from training. Duration: 5.12, lenth: 2420
2025-04-24 09:16:44,279 WARNING [train.py:847] Exclude cut with ID bc262e923d16a434a3c5701bea841e4e from training. Duration: 7.418775510204082, lenth: 2205
2025-04-24 09:16:44,447 WARNING [train.py:847] Exclude cut with ID 5141e01019dec47ed309fbf5881ab675 from training. Duration: 8.034104308390022, lenth: 2284
2025-04-24 09:16:44,478 WARNING [train.py:847] Exclude cut with ID 3555e5c7123d744b39ba7ecce44ce1e2 from training. Duration: 3.7151927437641725, lenth: 2226
2025-04-24 09:16:44,934 WARNING [train.py:847] Exclude cut with ID 973a1dc43ca452580d6999027a82fbec from training. Duration: 5.270929705215419, lenth: 2250
2025-04-24 09:16:45,047 WARNING [train.py:847] Exclude cut with ID e2791accb62bb5e4bde9e0e2b04945c5 from training. Duration: 3.517823129251701, lenth: 2286
2025-04-24 09:16:45,223 WARNING [train.py:847] Exclude cut with ID e91152d010c4479932b7ab64a998d5c9 from training. Duration: 4.771700680272109, lenth: 2468
2025-04-24 09:16:45,358 WARNING [train.py:847] Exclude cut with ID cff6a8011a86580fcf47595d18819ae9 from training. Duration: 4.458231292517007, lenth: 2205
2025-04-24 09:16:45,406 WARNING [train.py:847] Exclude cut with ID 1623cdac3d17f7a211e8a8f434f70657 from training. Duration: 4.191201814058957, lenth: 2308
2025-04-24 09:16:46,050 WARNING [train.py:847] Exclude cut with ID 273cfc04aebaf8b0cad541d0f3d1cded from training. Duration: 6.408707482993197, lenth: 2609
2025-04-24 09:16:46,130 WARNING [train.py:847] Exclude cut with ID 7d591b090a42935fd1edd8de97c690b6 from training. Duration: 5.038730158730159, lenth: 2229
2025-04-24 09:16:46,229 WARNING [train.py:847] Exclude cut with ID 1cfa9ef743b568bb034b415000c8c479 from training. Duration: 7.174965986394557, lenth: 2440
2025-04-24 09:16:47,010 WARNING [train.py:847] Exclude cut with ID 762676d460719a9203fa7e7537560530 from training. Duration: 3.1346938775510202, lenth: 2443
2025-04-24 09:16:47,513 WARNING [train.py:847] Exclude cut with ID 43575814ad7bb6d631483cdc513b49ac from training. Duration: 10.95981859410431, lenth: 2380
2025-04-24 09:16:47,634 WARNING [train.py:847] Exclude cut with ID ae208dee7119ca436f380b99077bcde9 from training. Duration: 5.2128798185941045, lenth: 2615
2025-04-24 09:16:47,726 WARNING [train.py:847] Exclude cut with ID b56757708077ab00baa7857c1cef8491 from training. Duration: 5.456689342403628, lenth: 2206
2025-04-24 09:16:47,959 WARNING [train.py:847] Exclude cut with ID f3f6d5671f3d11bc83f8fcad598007ff from training. Duration: 4.179591836734694, lenth: 2262
2025-04-24 09:16:48,380 WARNING [train.py:847] Exclude cut with ID 6af07f098cd794b9dd5a6bd660f53505 from training. Duration: 4.435011337868481, lenth: 2251
2025-04-24 09:16:48,518 WARNING [train.py:847] Exclude cut with ID 4ac1131393136a35ef54499bf3c728d4 from training. Duration: 7.732244897959184, lenth: 2211
|
686 |
+
2025-04-24 09:16:48,520 WARNING [train.py:847] Exclude cut with ID 482877e78defa83a355d2cdeb5a629ed from training. Duration: 5.154829931972789, lenth: 2891
|
687 |
+
2025-04-24 09:16:51,265 WARNING [train.py:847] Exclude cut with ID a93637e8f3e7bdf34bc8eb5d34d34197 from training. Duration: 3.215963718820862, lenth: 2235
|
688 |
+
2025-04-24 09:16:52,934 WARNING [train.py:847] Exclude cut with ID 90c6fbf20eeb34c13ffb97a198d84b0b from training. Duration: 5.0967800453514736, lenth: 2236
|
689 |
+
2025-04-24 09:16:53,026 INFO [train.py:539] Computing validation loss
|
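Note that the exclusions above are driven by the token-sequence length (logged as "lenth", a typo preserved verbatim from train.py), not by audio duration: every excluded cut reports a length above roughly 2200 whether it lasts 3 s or 13 s. A minimal sketch of such a filter, assuming Lhotse-style cuts and a hypothetical threshold and attribute name (neither the real cutoff nor the real field name is shown in this log):

import logging

MAX_TOKEN_LEN = 2200  # hypothetical; the actual threshold is not shown in the log

def keep_cut(cut) -> bool:
    # Exclude cuts whose token sequence is too long for the model.
    # cut.custom["length"] stands in for whatever field train.py logs as "lenth".
    token_len = cut.custom.get("length", 0)
    if token_len > MAX_TOKEN_LEN:
        logging.warning(
            "Exclude cut with ID %s from training. Duration: %s, lenth: %s",
            cut.id, cut.duration, token_len,
        )
        return False
    return True

# Typical Lhotse usage: cut_set = cut_set.filter(keep_cut)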
log/log-train-2025-04-24-10-12-23
ADDED
@@ -0,0 +1,600 @@
2025-04-24 10:12:23,878 INFO [train.py:653] {
  "allowed_excess_duration_ratio": 0.1,
  "audio_key": "question_audio",
  "batch_idx_train": 0,
  "best_train_epoch": -1,
  "best_train_loss": Infinity,
  "best_valid_epoch": -1,
  "best_valid_loss": Infinity,
  "bucketing_sampler": true,
  "deepscale": false,
  "deepscale_config": null,
  "deepspeed": true,
  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
  "drop_last": true,
  "enable_musan": false,
  "enable_spec_aug": true,
  "enable_speech_output": true,
  "encoder_projector_ds_rate": 8,
  "env_info": {
    "IP address": "0.114.172.122",
    "hostname": "7515258",
    "icefall-git-branch": "master",
    "icefall-git-date": "Thu Apr 24 08:24:11 2025",
    "icefall-git-sha1": "2e9be467-dirty",
    "icefall-path": "/workspace/slam/icefall_omni",
    "k2-build-type": "Release",
    "k2-git-date": "Tue Oct 29 09:02:19 2024",
    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
    "k2-version": "1.24.4",
    "k2-with-cuda": true,
    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
    "python-version": "3.11",
    "torch-cuda-available": true,
    "torch-cuda-version": "12.4",
    "torch-version": "2.4.0"
  },
  "exp_dir": "slam_omni/exp_speech2speech_rerun",
  "frame_shift_ms": 10,
  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
  "input_strategy": "PrecomputedFeatures",
  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
  "log_interval": 50,
  "manifest_dir": "data/fbank",
  "max_duration": 40,
  "num_buckets": 30,
  "num_epochs": 10,
  "num_workers": 2,
  "on_the_fly_feats": false,
  "pretrained_model_path": null,
  "resample_to_16kHz": true,
  "reset_interval": 200,
  "return_cuts": true,
  "sampler_state_dict_path": null,
  "seed": 42,
  "shuffle": true,
  "spec_aug_time_warp_factor": 80,
  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
  "start_epoch": 1,
  "subsampling_factor": 2,
  "tensorboard": true,
  "text_key": "answer",
  "unfreeze_llm": true,
  "unfreeze_speech_projector": true,
  "use_flash_attn": true,
  "use_fp16": true,
  "use_lora": true,
  "valid_interval": 5000
}
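This config shows the run trains under DeepSpeed ("deepspeed": true) with the ZeRO stage-1 config at ./slam_omni/ds_config_zero1.json, which is not part of this upload. A minimal sketch of what such a setup might look like when run under the deepspeed launcher; every value in ds_config below is an assumption, not the contents of the real file:

import torch
import deepspeed

ds_config = {
    "train_micro_batch_size_per_gpu": 1,   # assumption
    "fp16": {"enabled": True},             # consistent with "use_fp16": true above
    "zero_optimization": {"stage": 1},     # the "zero1" in the file name
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-4}},  # assumption
}

model = torch.nn.Linear(896, 896)  # stand-in for the real SLAM-Omni model

# DeepSpeed wraps the model and shards optimizer state across ranks (ZeRO-1).
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
    config=ds_config,
)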
2025-04-24 10:12:23,878 INFO [train.py:655] About to create model
|
72 |
+
2025-04-24 10:12:44,166 INFO [train.py:808] Number of model parameters: 1326848644
|
73 |
+
2025-04-24 10:12:44,166 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
|
74 |
+
2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
75 |
+
2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
76 |
+
2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
77 |
+
2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
78 |
+
2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
79 |
+
2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
80 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
81 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
82 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
83 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
84 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
85 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
86 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
87 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
88 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
89 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
90 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
91 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
92 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
93 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
94 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
95 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
96 |
+
2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
97 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
98 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
99 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
100 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
101 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
102 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
103 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
104 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
105 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
106 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
107 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
108 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
109 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
110 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
111 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
112 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
113 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
114 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
115 |
+
2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
116 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
117 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
118 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
119 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
120 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
121 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
122 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
123 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
124 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
125 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
126 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
127 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
128 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
129 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
130 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
131 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
132 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
133 |
+
2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
134 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
135 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
136 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
137 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
138 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
139 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
140 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
141 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
142 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
143 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
144 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
145 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
146 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
147 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
148 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
149 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
150 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
151 |
+
2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
152 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
153 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
154 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
155 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
156 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
157 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
158 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
159 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
160 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
161 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
162 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
163 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
164 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
165 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
166 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
167 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
168 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
169 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
170 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
171 |
+
2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
172 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
173 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
174 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
175 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
176 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
177 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
178 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
179 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
180 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
181 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
182 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
183 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
184 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
185 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
186 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
187 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
188 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
189 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
190 |
+
2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
191 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
192 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
193 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
194 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
195 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
196 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
197 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
198 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
199 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
200 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
201 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
202 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
203 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
204 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
205 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
206 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
207 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
208 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
209 |
+
2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
210 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
211 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
212 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
213 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
214 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
215 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
216 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
217 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
218 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
219 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
220 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
221 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
222 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
223 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
224 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
225 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
226 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
227 |
+
2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
228 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
229 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
230 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
231 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
232 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
233 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
234 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
235 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
236 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
237 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
238 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
239 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
240 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
241 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
242 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
243 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
244 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
245 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
246 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
247 |
+
2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
248 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
249 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
250 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
251 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
252 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
253 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
254 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
255 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
256 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
257 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
258 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
259 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
260 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
261 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
262 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
263 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
264 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
265 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
266 |
+
2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
267 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
268 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
269 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
270 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
271 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
272 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
273 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
274 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
275 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
276 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
277 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
278 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
279 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
280 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
281 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
282 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
283 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
284 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
285 |
+
2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
286 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
287 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
288 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
289 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
290 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
291 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
292 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
293 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
294 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
295 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
296 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
297 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
298 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
299 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
300 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
301 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
302 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
303 |
+
2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
304 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
305 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
306 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
307 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
308 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
309 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
310 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
311 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
312 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
313 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
314 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
315 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
316 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
317 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
318 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
319 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
320 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
321 |
+
2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
322 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
323 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
324 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
325 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
326 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
327 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
328 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
329 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
330 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
331 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
332 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
333 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
334 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
335 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
336 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
337 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
338 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
339 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
340 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
341 |
+
2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
342 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
343 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
344 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
345 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
346 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
347 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
348 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
349 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
350 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
351 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
352 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
353 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
354 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
355 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
356 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
357 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
358 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
359 |
+
2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
360 |
+
2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
361 |
+
2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
362 |
+
2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
363 |
+
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
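The adapter shapes above are what PEFT reports for LoRA rank 64 applied to every attention and MLP projection of Qwen2.5-0.5B-Instruct (hidden size 896, grouped-query k/v projections of width 128, MLP width 4864): `lora_A` is `[r, in_features]` and `lora_B` is `[out_features, r]`. A minimal sketch that would reproduce these shapes, assuming the standard `peft`/`transformers` APIs (alpha, dropout, and other LoRA settings are not recorded in this log):

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained("models/Qwen2.5-0.5B-Instruct")
lora_config = LoraConfig(
    r=64,  # rank 64 -> lora_A: [64, in_features], lora_B: [out_features, 64]
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
)
llm = get_peft_model(llm, lora_config)
llm.print_trainable_parameters()  # peft helper: prints the trainable fraction
```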
+2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
+2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
+2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
+2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
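The `encoder_projector` input width of 10240 is consistent with the logged `encoder_projector_ds_rate` of 8 applied to 1280-dim Whisper large-v2 encoder frames (8 × 1280 = 10240), projected into the 896-dim Qwen2.5-0.5B embedding space. A hypothetical sketch of such a frame-stacking projector (the actual module code is not part of this log):

```python
import torch.nn as nn

class EncoderProjector(nn.Module):
    """Hypothetical frame-stacking projector matching the logged shapes:
    8 consecutive 1280-dim Whisper frames are concatenated (8 * 1280 = 10240)
    and mapped into the 896-dim LLM embedding space."""

    def __init__(self, encoder_dim: int = 1280, llm_dim: int = 896, ds_rate: int = 8):
        super().__init__()
        self.ds_rate = ds_rate
        self.linear1 = nn.Linear(encoder_dim * ds_rate, llm_dim)  # weight: [896, 10240]
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(llm_dim, llm_dim)                # weight: [896, 896]

    def forward(self, x):  # x: (batch, num_frames, encoder_dim)
        b, t, d = x.shape
        t = t - t % self.ds_rate  # drop ragged tail frames before stacking
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.linear2(self.relu(self.linear1(x)))
```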
+2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:12:44,194 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
+2025-04-24 10:12:44,194 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
+2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
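The last four entries close out the speech-generation path: `speech_token_projector` lifts 896-dim LLM hidden states into the 1024-dim codec LM (whose adapters above follow the same rank-64 pattern, with attention width 1024 and MLP width 2048), and `codec_lm_head` predicts over 4100 outputs, plausibly 4096 speech codec tokens plus a few special ids (an assumption; the vocabulary split is not stated in the log). As plain modules these would be:

```python
import torch.nn as nn

# Sizes read directly off the log; the 4096-plus-specials split is an assumption.
speech_token_projector = nn.Linear(896, 1024)  # weight: [1024, 896], bias: [1024]
codec_lm_head = nn.Linear(1024, 4100)          # weight: [4100, 1024], bias: [4100]
```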
+2025-04-24 10:12:44,194 INFO [train.py:819] Device: cuda:0
+2025-04-24 10:12:44,922 INFO [train.py:823] Using DeepSpeed
+2025-04-24 10:12:46,690 INFO [data_module.py:445] About to get train cuts
+2025-04-24 10:12:46,693 INFO [data_module.py:250] Disable MUSAN
+2025-04-24 10:12:46,693 INFO [data_module.py:268] Enable SpecAugment
+2025-04-24 10:12:46,693 INFO [data_module.py:269] Time warp factor: 80
+2025-04-24 10:12:46,693 INFO [data_module.py:279] Num frame mask: 10
+2025-04-24 10:12:46,693 INFO [data_module.py:292] About to create train dataset
+2025-04-24 10:12:46,693 INFO [data_module.py:319] Using DynamicBucketingSampler.
+2025-04-24 10:12:47,691 INFO [data_module.py:336] About to create train dataloader
+2025-04-24 10:12:47,692 INFO [data_module.py:436] About to get test cuts
+2025-04-24 10:12:47,693 INFO [data_module.py:365] About to create dev dataset
+2025-04-24 10:12:47,781 INFO [data_module.py:379] About to create dev dataloader
+2025-04-24 10:12:47,788 INFO [train.py:875] start training from epoch 1
+2025-04-24 10:13:18,332 INFO [train.py:539] Computing validation loss
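MUSAN noise mixing is disabled for this run, while SpecAugment is on with a time-warp factor of 80 and 10 frame masks. A sketch of the equivalent lhotse transform, assuming its standard `SpecAugment` interface (mask sizes and other settings are left at library defaults since the log does not record them):

```python
from lhotse.dataset import SpecAugment

# Matches the two logged settings; everything else stays at lhotse defaults.
spec_aug = SpecAugment(
    time_warp_factor=80,  # "Time warp factor: 80"
    num_frame_masks=10,   # "Num frame mask: 10"
)
# Applied to a (batch, time, num_features) fbank tensor during training:
# feats = spec_aug(feats)
```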
log/log-train-2025-04-24-10-14-50
ADDED
@@ -0,0 +1,600 @@
+2025-04-24 10:14:50,019 INFO [train.py:653] {
+  "allowed_excess_duration_ratio": 0.1,
+  "audio_key": "question_audio",
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "bucketing_sampler": true,
+  "deepscale": false,
+  "deepscale_config": null,
+  "deepspeed": true,
+  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
+  "drop_last": true,
+  "enable_musan": false,
+  "enable_spec_aug": true,
+  "enable_speech_output": true,
+  "encoder_projector_ds_rate": 8,
+  "env_info": {
+    "IP address": "0.114.172.122",
+    "hostname": "7515258",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Thu Apr 24 08:24:11 2025",
+    "icefall-git-sha1": "2e9be467-dirty",
+    "icefall-path": "/workspace/slam/icefall_omni",
+    "k2-build-type": "Release",
+    "k2-git-date": "Tue Oct 29 09:02:19 2024",
+    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
+    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
+    "python-version": "3.11",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.4",
+    "torch-version": "2.4.0"
+  },
+  "exp_dir": "slam_omni/exp_speech2speech_rerun",
+  "frame_shift_ms": 10,
+  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+  "input_strategy": "PrecomputedFeatures",
+  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+  "log_interval": 50,
+  "manifest_dir": "data/fbank",
+  "max_duration": 40,
+  "num_buckets": 30,
+  "num_epochs": 10,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "pretrained_model_path": null,
+  "resample_to_16kHz": true,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "sampler_state_dict_path": null,
+  "seed": 42,
+  "shuffle": true,
+  "spec_aug_time_warp_factor": 80,
+  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+  "start_epoch": 1,
+  "subsampling_factor": 2,
+  "tensorboard": true,
+  "text_key": "answer",
+  "unfreeze_llm": true,
+  "unfreeze_speech_projector": true,
+  "use_flash_attn": true,
+  "use_fp16": true,
+  "use_lora": true,
+  "valid_interval": 5000
+}
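Per the options above, batches are drawn by lhotse's `DynamicBucketingSampler` with at most 40 seconds of audio per batch across 30 duration buckets. A minimal sketch of an equivalent sampler, assuming a standard lhotse setup (the manifest filename is illustrative, not taken from the log):

```python
from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler

cuts = CutSet.from_file("data/fbank/cuts_train.jsonl.gz")  # illustrative path
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=40,   # "max_duration": 40 (seconds of audio per batch)
    num_buckets=30,    # "num_buckets": 30
    shuffle=True,      # "shuffle": true
    drop_last=True,    # "drop_last": true
    seed=42,           # "seed": 42
)
```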
+2025-04-24 10:14:50,020 INFO [train.py:655] About to create model
+2025-04-24 10:15:09,864 INFO [train.py:808] Number of model parameters: 1327766148
+2025-04-24 10:15:09,865 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
228 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
229 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
230 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
231 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
232 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
233 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
234 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
235 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
236 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
237 |
+
2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
238 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
239 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
240 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
241 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
242 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
243 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
244 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
245 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
246 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
247 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
248 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
249 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
250 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
251 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
252 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
253 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
254 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
255 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
256 |
+
2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
257 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
258 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
259 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
260 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
261 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
262 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
263 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
264 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
265 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
266 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
267 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
268 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
269 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
270 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
271 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
272 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
273 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
274 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
275 |
+
2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
276 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
277 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
278 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
279 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
280 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
281 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
282 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
283 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
284 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
285 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
286 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
287 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
288 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
289 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
290 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
291 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
292 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
293 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
294 |
+
2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
295 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
296 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
297 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
298 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
299 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
300 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
301 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
302 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
303 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
304 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
305 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
306 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
307 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
308 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
309 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
310 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
311 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
312 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
313 |
+
2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
314 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
315 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
316 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
317 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
318 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
319 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
320 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
321 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
322 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
323 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
324 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
325 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
326 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
327 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
328 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
329 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
330 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
331 |
+
2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
332 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
333 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
334 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
335 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
336 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
337 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
338 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
339 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
340 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
341 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
342 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
343 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
344 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
345 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
346 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
347 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
348 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
349 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
350 |
+
2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
351 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
352 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
353 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
354 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
355 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
356 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
357 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
358 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
359 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
360 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
361 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
362 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
363 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
364 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
365 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
366 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
367 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
368 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
369 |
+
2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
370 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
371 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
372 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
373 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
374 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
375 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
376 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
377 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
378 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
379 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
380 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
381 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
382 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
383 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
384 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
385 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
386 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
387 |
+
2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
388 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
389 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
390 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
391 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
392 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
393 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
394 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
395 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
396 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
397 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
398 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
399 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
400 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
401 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
402 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
403 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
404 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
405 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
406 |
+
2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
407 |
+
2025-04-24 10:15:09,884 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
408 |
+
2025-04-24 10:15:09,884 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
409 |
+
2025-04-24 10:15:09,884 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
410 |
+
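The per-layer shapes above follow directly from a rank-64 LoRA adapter on an 896-dim decoder with grouped-query attention: lora_A is always [64, in_features], lora_B is [out_features, 64], and the [128, 64] k_proj/v_proj lora_B shapes indicate a KV width of 128. A minimal sketch that reproduces this listing, assuming HuggingFace PEFT and a Qwen2-0.5B-style base model (the log records only the shapes, not the checkpoint or LoRA hyperparameters; the `llm.` prefix in the names would come from the PEFT model being a submodule named `llm`):

    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM

    # Assumed checkpoint: any model with hidden_size=896, intermediate_size=4864,
    # and 2 KV heads of head_dim 64 yields the shapes logged above.
    base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
    lora_config = LoraConfig(
        r=64,            # rank 64: lora_A [64, in_features], lora_B [out_features, 64]
        lora_alpha=16,   # assumed; the scaling factor is not recorded in this log
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
    )
    model = get_peft_model(base, lora_config)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: {param.shape}")  # matches the listing above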
+2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
+2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
+2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
+2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
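The encoder_projector maps speech-encoder output into the 896-dim LLM embedding space. The logged shapes pin down a two-linear-layer module with a 10240-dim input; one plausible reading (an assumption, since only the shapes appear in the log) is that several consecutive encoder frames are stacked before projection, e.g. 8 x 1280 for a Whisper-style encoder. A sketch constrained by those shapes:

    import torch
    import torch.nn as nn

    class EncoderProjector(nn.Module):
        # linear1.weight [896, 10240], linear2.weight [896, 896] as logged.
        # The activation between the two layers is assumed (ReLU here).
        def __init__(self, encoder_dim: int = 10240, llm_dim: int = 896):
            super().__init__()
            self.linear1 = nn.Linear(encoder_dim, llm_dim)
            self.relu = nn.ReLU()
            self.linear2 = nn.Linear(llm_dim, llm_dim)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # x: (batch, frames, 10240) stacked encoder features
            return self.linear2(self.relu(self.linear1(x)))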
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:15:09,892 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
|
583 |
+
2025-04-24 10:15:09,893 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
|
584 |
+
2025-04-24 10:15:09,893 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
|
585 |
+
2025-04-24 10:15:09,893 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
|
586 |
+
2025-04-24 10:15:09,893 INFO [train.py:819] Device: cuda:1
|
587 |
+
2025-04-24 10:15:10,679 INFO [train.py:823] Using DeepSpeed
|
588 |
+
2025-04-24 10:15:12,380 INFO [data_module.py:445] About to get train cuts
|
589 |
+
2025-04-24 10:15:12,382 INFO [data_module.py:250] Disable MUSAN
|
590 |
+
2025-04-24 10:15:12,382 INFO [data_module.py:268] Enable SpecAugment
|
591 |
+
2025-04-24 10:15:12,382 INFO [data_module.py:269] Time warp factor: 80
|
592 |
+
2025-04-24 10:15:12,382 INFO [data_module.py:279] Num frame mask: 10
|
593 |
+
2025-04-24 10:15:12,382 INFO [data_module.py:292] About to create train dataset
|
594 |
+
2025-04-24 10:15:12,382 INFO [data_module.py:319] Using DynamicBucketingSampler.
|
595 |
+
2025-04-24 10:15:13,374 INFO [data_module.py:336] About to create train dataloader
|
596 |
+
2025-04-24 10:15:13,374 INFO [data_module.py:436] About to get test cuts
|
597 |
+
2025-04-24 10:15:13,375 INFO [data_module.py:365] About to create dev dataset
|
598 |
+
2025-04-24 10:15:13,462 INFO [data_module.py:379] About to create dev dataloader
|
599 |
+
2025-04-24 10:15:13,468 INFO [train.py:875] start training from epoch 1
|
600 |
+
2025-04-24 10:15:42,675 INFO [train.py:539] Computing validation loss
|
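Note on the shapes above: the lora_A / lora_B pairs follow the standard low-rank adapter factorization. For a frozen projection W of shape [out_features, in_features], LoRA trains A of shape [r, in_features] and B of shape [out_features, r] and adds B(Ax) to the output, so the [64, 1024] / [1024, 64] pairs logged for the codec LM imply rank r = 64 on a 1024-dim hidden size. A minimal PyTorch sketch of that update (illustrative only; LoRALinear is a hypothetical name, not the training script's API):

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    # Frozen base projection plus a trainable rank-r update,
    # mirroring the lora_A/lora_B shapes in the log above.
    def __init__(self, in_features: int, out_features: int, r: int = 64):
        super().__init__()
        self.base = nn.Linear(in_features, out_features, bias=False)
        self.base.weight.requires_grad_(False)                     # base stays frozen
        self.lora_A = nn.Parameter(torch.empty(r, in_features))    # e.g. [64, 1024]
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))   # e.g. [1024, 64]
        nn.init.normal_(self.lora_A, std=0.02)  # B starts at zero, so the adapter is a no-op initially

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # y = W x + B (A x); only lora_A and lora_B receive gradients
        return self.base(x) + (x @ self.lora_A.T) @ self.lora_B.T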
log/log-train-2025-04-24-10-17-26
ADDED
@@ -0,0 +1,607 @@
+2025-04-24 10:17:26,814 INFO [train.py:653] {
+  "allowed_excess_duration_ratio": 0.1,
+  "audio_key": "question_audio",
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "bucketing_sampler": true,
+  "deepscale": false,
+  "deepscale_config": null,
+  "deepspeed": true,
+  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
+  "drop_last": true,
+  "enable_musan": false,
+  "enable_spec_aug": true,
+  "enable_speech_output": true,
+  "encoder_projector_ds_rate": 8,
+  "env_info": {
+    "IP address": "0.114.172.122",
+    "hostname": "7515258",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Thu Apr 24 08:24:11 2025",
+    "icefall-git-sha1": "2e9be467-dirty",
+    "icefall-path": "/workspace/slam/icefall_omni",
+    "k2-build-type": "Release",
+    "k2-git-date": "Tue Oct 29 09:02:19 2024",
+    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
+    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
+    "python-version": "3.11",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.4",
+    "torch-version": "2.4.0"
+  },
+  "exp_dir": "slam_omni/exp_speech2speech_rerun",
+  "frame_shift_ms": 10,
+  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+  "input_strategy": "PrecomputedFeatures",
+  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+  "log_interval": 50,
+  "manifest_dir": "data/fbank",
+  "max_duration": 40,
+  "num_buckets": 30,
+  "num_epochs": 10,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "pretrained_model_path": null,
+  "resample_to_16kHz": true,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "sampler_state_dict_path": null,
+  "seed": 42,
+  "shuffle": true,
+  "spec_aug_time_warp_factor": 80,
+  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+  "start_epoch": 1,
+  "subsampling_factor": 2,
+  "tensorboard": true,
+  "text_key": "answer",
+  "unfreeze_llm": true,
+  "unfreeze_speech_projector": true,
+  "use_flash_attn": true,
+  "use_fp16": true,
+  "use_lora": true,
+  "valid_interval": 5000
+}
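The dump above shows rank-64 LoRA enabled over both the Qwen2.5-0.5B LLM and the codec LM ("use_lora": true, "unfreeze_llm": true) with DeepSpeed ZeRO-1 ("deepspeed_config": "./slam_omni/ds_config_zero1.json"). A hedged sketch of how such a setup is commonly wired with peft and deepspeed; apart from r=64, the target projections, and the paths taken from this log, the values below are assumptions, not settings read from the training script:

import deepspeed
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

lora_config = LoraConfig(
    r=64,              # matches the [64, *] / [*, 64] adapter shapes logged below
    lora_alpha=16,     # assumed; not recorded in this log
    lora_dropout=0.05, # assumed
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

llm = AutoModelForCausalLM.from_pretrained("models/Qwen2.5-0.5B-Instruct")
llm = get_peft_model(llm, lora_config)  # yields the base_model.model.model.layers.* names

# Only LoRA weights and the projector heads stay trainable; ZeRO-1 shards
# optimizer state across ranks.
model_engine, _, _, _ = deepspeed.initialize(
    model=llm,
    model_parameters=[p for p in llm.parameters() if p.requires_grad],
    config="./slam_omni/ds_config_zero1.json",
)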
71 |
+
2025-04-24 10:17:26,814 INFO [train.py:655] About to create model
|
72 |
+
2025-04-24 10:17:46,427 INFO [train.py:808] Number of model parameters: 1327766148
|
73 |
+
2025-04-24 10:17:46,427 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
|
74 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
75 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
76 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
77 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
78 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
79 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
80 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
81 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
82 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
83 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
84 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
85 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
86 |
+
2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
87 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
88 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
89 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
90 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
91 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
92 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
93 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
94 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
95 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
96 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
97 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
98 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
99 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
100 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
101 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
102 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
103 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
104 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
105 |
+
2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
106 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
107 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
108 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
109 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
110 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
111 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
112 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
113 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
114 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
115 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
116 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
117 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
118 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
119 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
120 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
121 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
122 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
123 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
124 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
125 |
+
2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
126 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
127 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
128 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
129 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
130 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
131 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
132 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
133 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
134 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
135 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
136 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
137 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
138 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
139 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
140 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
141 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
142 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
143 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
144 |
+
2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
145 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
146 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
147 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
148 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
149 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
150 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
151 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
152 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
153 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
154 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
155 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
156 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
157 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
158 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
159 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
160 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
161 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
162 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
163 |
+
2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
164 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
165 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
166 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
167 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
168 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
169 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
170 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
171 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
172 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
173 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
174 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
175 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
176 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
177 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
178 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
179 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
180 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
181 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
182 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
183 |
+
2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
184 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
185 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
186 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
187 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
188 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
189 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
190 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
191 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
192 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
193 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
194 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
195 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
196 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
197 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
198 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
199 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
200 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
201 |
+
2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
202 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
203 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
204 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
205 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
206 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
207 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
208 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
209 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
210 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
211 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
212 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
213 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
214 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
215 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
216 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
217 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
218 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
219 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
220 |
+
2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
221 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
222 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
223 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
224 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
225 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
226 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
227 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
228 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
229 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
230 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
231 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
232 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
233 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
234 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
235 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
236 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
237 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
238 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
239 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
240 |
+
2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
241 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
242 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
243 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
244 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
245 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
246 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
247 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
248 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
249 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
250 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
251 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
252 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
253 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
254 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
255 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
256 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
257 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
258 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
259 |
+
2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
260 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
261 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
262 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
263 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
264 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
265 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
266 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
267 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
268 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
269 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
270 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
271 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
272 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
273 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
274 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
275 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
276 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
277 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
278 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
279 |
+
2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
280 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
281 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
282 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
283 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
284 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
285 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
286 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
287 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
288 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
289 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
290 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
291 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
292 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
293 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
294 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
295 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
296 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
297 |
+
2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
298 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
299 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
300 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
301 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
302 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
303 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
304 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
305 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
306 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
307 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
308 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
309 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
310 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
311 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
312 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
313 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
314 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
315 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
316 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
317 |
+
2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
318 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
319 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
320 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
321 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
322 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
323 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
324 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
325 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
326 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
327 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
328 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
329 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
330 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
331 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
332 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
333 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
334 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
335 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
336 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
337 |
+
2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
338 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
339 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
340 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
341 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
342 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
343 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
344 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
345 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
346 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
347 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
348 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
349 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
350 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
351 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
352 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
353 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
354 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
355 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
356 |
+
2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
357 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
358 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
359 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
360 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
361 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
362 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
363 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
364 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
365 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
366 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
367 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
368 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
369 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
370 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
371 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
372 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
373 |
+
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
2025-04-24 10:17:46,445 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,445 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:17:46,454 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
2025-04-24 10:17:46,454 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
2025-04-24 10:17:46,454 INFO [train.py:819] Device: cuda:0
2025-04-24 10:17:47,363 INFO [train.py:823] Using DeepSpeed
2025-04-24 10:17:49,160 INFO [data_module.py:445] About to get train cuts
2025-04-24 10:17:49,163 INFO [data_module.py:250] Disable MUSAN
2025-04-24 10:17:49,163 INFO [data_module.py:268] Enable SpecAugment
2025-04-24 10:17:49,163 INFO [data_module.py:269] Time warp factor: 80
2025-04-24 10:17:49,163 INFO [data_module.py:279] Num frame mask: 10
2025-04-24 10:17:49,163 INFO [data_module.py:292] About to create train dataset
2025-04-24 10:17:49,163 INFO [data_module.py:319] Using DynamicBucketingSampler.
2025-04-24 10:17:50,163 INFO [data_module.py:336] About to create train dataloader
2025-04-24 10:17:50,164 INFO [data_module.py:436] About to get test cuts
2025-04-24 10:17:50,164 INFO [data_module.py:365] About to create dev dataset
2025-04-24 10:17:50,251 INFO [data_module.py:379] About to create dev dataloader
2025-04-24 10:17:50,260 INFO [train.py:875] start training from epoch 1
2025-04-24 10:18:19,657 INFO [train.py:539] Computing validation loss
2025-04-24 10:18:23,605 INFO [utils.py:1966] Filtering uneven-sized batch, original batch size is 2, retained batch size is 1.
2025-04-24 10:18:36,169 INFO [train.py:548] Epoch 1, validation: loss=0.007026, acc=0.4635, codec_acc=0.0004482, codec_topk_acc=0.003435, codec_loss=0.0053, text_loss=0.001726, over 284780.00 frames.
2025-04-24 10:18:36,170 INFO [train.py:549] Maximum memory allocated so far is 9104MB
2025-04-24 10:18:36,617 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.006556, acc=0.4215, codec_acc=0, codec_topk_acc=0.003
2025-04-24 10:18:36,617 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.006369, acc=0.48, codec_acc=0, codec_topk_acc=0.003988, codec_loss=0.004861, text_loss=0.001508, over 1743.00 frames. ], tot_loss[loss=0.006369, acc=0.48, codec_acc=
2025-04-24 10:19:00,389 INFO [train.py:611] Epoch 1, batch 50, loss[loss=0.006524, acc=0.4478, codec_acc=0.01511, codec_top
2025-04-24 10:19:00,389 INFO [train.py:611] Epoch 1, batch 50, loss[loss=0.006777, acc=0.3789, codec_acc=0.01844, codec_topk_acc=0.08811, codec_loss=0.004652, text_loss=0.002125, over 1371.00 frames. ], tot_loss[loss=0.005674, acc=0.4825, codec_acc=0.0086
2025-04-24 10:19:26,931 INFO [train.py:611] Epoch 1, batch 100, loss[loss=0.00485, acc=0.4072, codec_acc=0.02365, codec_topk
2025-04-24 10:19:26,931 INFO [train.py:611] Epoch 1, batch 100, loss[loss=0.004453, acc=0.6573, codec_acc=0.02348, codec_topk_acc=0.08241, codec_loss=0.003445, text_loss=0.001008, over 1826.00 frames. ], tot_loss[loss=0.005231, acc=0.4903, codec_acc=0.02
2025-04-24 10:19:40,453 INFO [train.py:940] Saving batch to slam_omni/exp_speech2speech_rerun/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
2025-04-24 10:19:40,475 INFO [train.py:945] features shape: torch.Size([8, 455, 80])
h-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
2025-04-24 10:19:40,473 INFO [train.py:945] features shape: torch.Size([8, 457, 80])
log/log-train-2025-04-24-10-21-52
ADDED
@@ -0,0 +1,604 @@
2025-04-24 10:21:52,552 INFO [train.py:653] {
  "allowed_excess_duration_ratio": 0.1,
  "audio_key": "question_audio",
  "batch_idx_train": 0,
  "best_train_epoch": -1,
  "best_train_loss": Infinity,
  "best_valid_epoch": -1,
  "best_valid_loss": Infinity,
  "bucketing_sampler": true,
  "deepscale": false,
  "deepscale_config": null,
  "deepspeed": true,
  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
  "drop_last": true,
  "enable_musan": false,
  "enable_spec_aug": true,
  "enable_speech_output": true,
  "encoder_projector_ds_rate": 8,
  "env_info": {
    "IP address": "0.114.183.253",
    "hostname": "7518205",
    "icefall-git-branch": null,
    "icefall-git-date": null,
    "icefall-git-sha1": null,
    "icefall-path": "/workspace/slam/icefall_omni",
    "k2-build-type": "Release",
    "k2-git-date": "Tue Oct 29 09:02:19 2024",
    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
    "k2-version": "1.24.4",
    "k2-with-cuda": true,
    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
    "python-version": "3.11",
    "torch-cuda-available": true,
    "torch-cuda-version": "12.4",
    "torch-version": "2.4.0"
  },
  "exp_dir": "slam_omni/exp_speech2speech_rerun",
  "frame_shift_ms": 10,
  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
  "input_strategy": "PrecomputedFeatures",
  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
  "log_interval": 50,
  "manifest_dir": "data/fbank",
  "max_duration": 50,
  "num_buckets": 30,
  "num_epochs": 10,
  "num_workers": 2,
  "on_the_fly_feats": false,
  "pretrained_model_path": null,
  "resample_to_16kHz": true,
  "reset_interval": 200,
  "return_cuts": true,
  "sampler_state_dict_path": null,
  "seed": 42,
  "shuffle": true,
  "spec_aug_time_warp_factor": 80,
  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
  "start_epoch": 1,
  "subsampling_factor": 2,
  "tensorboard": true,
  "text_key": "answer",
  "unfreeze_llm": true,
  "unfreeze_speech_projector": true,
  "use_flash_attn": true,
  "use_fp16": true,
  "use_lora": true,
  "valid_interval": 5000
}
2025-04-24 10:21:52,552 INFO [train.py:655] About to create model
|
72 |
+
2025-04-24 10:22:24,486 INFO [train.py:808] Number of model parameters: 1327766148
|
73 |
+
2025-04-24 10:22:24,487 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
|
74 |
+
2025-04-24 10:22:24,487 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
75 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
76 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
77 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
78 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
79 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
80 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
81 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
82 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
83 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
84 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
85 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
86 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
87 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
88 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
89 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
90 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
91 |
+
2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
92 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
93 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
94 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
95 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
96 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
97 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
98 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
99 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
100 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
101 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
102 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
103 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
104 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
105 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
106 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
107 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
108 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
109 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
110 |
+
2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
111 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
112 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
113 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
114 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
115 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
116 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
117 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
118 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
119 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
120 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
121 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
122 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
123 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
124 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
125 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
126 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
127 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
128 |
+
2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
129 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
130 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
131 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
132 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
133 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
134 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
135 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
136 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
137 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
138 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
139 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
140 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
141 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
142 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
143 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
144 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
145 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
146 |
+
2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
147 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
148 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
149 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
150 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
151 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
152 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
153 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
154 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
155 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
156 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
157 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
158 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
159 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
160 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
161 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
162 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
163 |
+
2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
164 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
165 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
166 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
167 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
168 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
169 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
170 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
171 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
172 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
173 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
174 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
175 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
176 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
177 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
178 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
179 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
180 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
181 |
+
2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
182 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
183 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
184 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
185 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
186 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
187 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
188 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
189 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
190 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
191 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
192 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
193 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
194 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
195 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
196 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
197 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
198 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
199 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
200 |
+
2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
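The four encoder_projector tensors above describe a two-layer MLP that maps stacked speech-encoder frames into the 896-dim LLM embedding space: linear1 consumes a 10240-dim input, which is consistent with concatenating k consecutive encoder frames (e.g. 8 x 1280; only the product 10240 is visible in this log, so that split is an assumption). A minimal PyTorch sketch matching the logged weight shapes:

```python
import torch.nn as nn

class EncoderProjector(nn.Module):
    """Sketch matching the logged shapes:
    linear1.weight [896, 10240], linear2.weight [896, 896].
    encoder_dim=1280 and stack size k=8 are assumptions; the log
    only shows their product (10240)."""
    def __init__(self, encoder_dim=1280, llm_dim=896, k=8):
        super().__init__()
        self.k = k
        self.linear1 = nn.Linear(encoder_dim * k, llm_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(llm_dim, llm_dim)

    def forward(self, x):  # x: (batch, T, encoder_dim)
        b, t, d = x.shape
        t = t - t % self.k  # drop frames that do not fill a whole stack
        x = x[:, :t, :].reshape(b, t // self.k, d * self.k)
        return self.linear2(self.relu(self.linear1(x)))
```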
2025-04-24 10:22:24,505 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,505 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
|
573 |
+
2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
|
574 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
|
575 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
|
576 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
|
577 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
|
578 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
|
579 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
|
580 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
|
581 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
|
582 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
|
583 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
|
584 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
|
585 |
+
2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
|
586 |
+
2025-04-24 10:22:24,514 INFO [train.py:819] Device: cuda:3
|
587 |
+
2025-04-24 10:22:25,742 INFO [train.py:823] Using DeepSpeed
|
588 |
+
2025-04-24 10:22:52,214 INFO [data_module.py:445] About to get train cuts
|
589 |
+
2025-04-24 10:22:52,216 INFO [data_module.py:250] Disable MUSAN
|
590 |
+
2025-04-24 10:22:52,216 INFO [data_module.py:268] Enable SpecAugment
|
591 |
+
2025-04-24 10:22:52,216 INFO [data_module.py:269] Time warp factor: 80
|
592 |
+
2025-04-24 10:22:52,216 INFO [data_module.py:279] Num frame mask: 10
|
593 |
+
2025-04-24 10:22:52,217 INFO [data_module.py:292] About to create train dataset
|
594 |
+
2025-04-24 10:22:52,217 INFO [data_module.py:319] Using DynamicBucketingSampler.
|
595 |
+
2025-04-24 10:22:53,219 INFO [data_module.py:336] About to create train dataloader
|
596 |
+
2025-04-24 10:22:53,219 INFO [data_module.py:436] About to get test cuts
|
597 |
+
2025-04-24 10:22:53,220 INFO [data_module.py:365] About to create dev dataset
|
598 |
+
2025-04-24 10:22:53,308 INFO [data_module.py:379] About to create dev dataloader
|
599 |
+
2025-04-24 10:22:53,315 INFO [train.py:875] start training from epoch 1
|
600 |
+
2025-04-24 10:23:23,440 INFO [train.py:539] Computing validation loss
|
601 |
+
2025-04-24 10:23:31,208 INFO [train.py:548] Epoch 1, validation: loss=0.005359, acc=0.4658, codec_acc=0.0004324, codec_topk_acc=0.003407, codec_loss=0.004039, text_loss=0.00132, over 285507.00 frames.
|
602 |
+
2025-04-24 10:23:31,209 INFO [train.py:549] Maximum memory allocated so far is 12383MB
|
603 |
+
2025-04-24 10:23:31,687 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.005086, acc=0.4173, codec_acc=0.001085, codec_topk_ac2025-04-24 10:23:31,687 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.004863, acc=0.4196, codec_acc=0.0002915, codec_topk_acc=0.002623, codec_loss=0.00371, text_loss=0.001153, over 2279.00 frames. ], tot_loss[loss=0.004863, acc=0.4196, codec_acc=0.0002025-04-24 10:23:39,106 INFO [train.py:940] Saving batch to slam_omni/exp_speech2speech_rerun/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
|
604 |
+
2025-04-24 10:23:39,117 INFO [train.py:945] features shape: torch.Size([4, 1168, 80])
|
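These lora_A/lora_B shapes follow directly from LoRA's rank-64 factorization: an adapter for a Linear layer mapping in_features -> out_features stores A as [r, in_features] and B as [out_features, r], so the codec LM's up_proj (1024 -> 2048) logs [64, 1024] and [2048, 64]. Below is a minimal PyTorch sketch of that shape logic only; the adapters in this run come from PEFT, and the LoRALinear name here is hypothetical.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    # Hedged sketch of a rank-r LoRA adapter around a frozen Linear layer;
    # it mirrors the logged shapes, not PEFT's exact implementation.
    def __init__(self, in_features: int, out_features: int, r: int = 64, alpha: float = 16.0):
        super().__init__()
        self.base = nn.Linear(in_features, out_features)
        for p in self.base.parameters():
            p.requires_grad_(False)  # pretrained weights stay frozen
        self.lora_A = nn.Parameter(torch.randn(r, in_features) * 0.01)  # [r, in]
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))        # [out, r], zero-init
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # y = Wx + scaling * B(Ax): only lora_A and lora_B receive gradients.
        return self.base(x) + (x @ self.lora_A.T @ self.lora_B.T) * self.scaling

up_proj = LoRALinear(1024, 2048, r=64)
print(up_proj.lora_A.shape, up_proj.lora_B.shape)  # torch.Size([64, 1024]) torch.Size([2048, 64])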
log/log-train-2025-04-24-10-24-20 ADDED
The diff for this file is too large to render.

log/log-train-2025-04-24-14-23-07 ADDED
The diff for this file is too large to render.
log/log-train-2025-04-25-00-57-07 ADDED
@@ -0,0 +1,71 @@
2025-04-25 00:57:07,641 INFO [train.py:653] {
  "allowed_excess_duration_ratio": 0.1,
  "audio_key": "question_audio",
  "batch_idx_train": 0,
  "best_train_epoch": -1,
  "best_train_loss": Infinity,
  "best_valid_epoch": -1,
  "best_valid_loss": Infinity,
  "bucketing_sampler": true,
  "deepscale": false,
  "deepscale_config": null,
  "deepspeed": true,
  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
  "drop_last": true,
  "enable_musan": false,
  "enable_spec_aug": true,
  "enable_speech_output": true,
  "encoder_projector_ds_rate": 8,
  "env_info": {
    "IP address": "0.114.183.253",
    "hostname": "7518205",
    "icefall-git-branch": null,
    "icefall-git-date": null,
    "icefall-git-sha1": null,
    "icefall-path": "/workspace/slam/icefall_omni",
    "k2-build-type": "Release",
    "k2-git-date": "Tue Oct 29 09:02:19 2024",
    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
    "k2-version": "1.24.4",
    "k2-with-cuda": true,
    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
    "python-version": "3.11",
    "torch-cuda-available": true,
    "torch-cuda-version": "12.4",
    "torch-version": "2.4.0"
  },
  "exp_dir": "slam_omni/exp_speech2speech_rerun",
  "frame_shift_ms": 10,
  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
  "input_strategy": "PrecomputedFeatures",
  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
  "log_interval": 50,
  "manifest_dir": "data/fbank",
  "max_duration": 50,
  "num_buckets": 30,
  "num_epochs": 10,
  "num_workers": 2,
  "on_the_fly_feats": false,
  "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
  "resample_to_16kHz": true,
  "reset_interval": 200,
  "return_cuts": true,
  "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
  "seed": 42,
  "shuffle": true,
  "spec_aug_time_warp_factor": 80,
  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
  "start_epoch": 1,
  "subsampling_factor": 2,
  "tensorboard": true,
  "text_key": "answer",
  "unfreeze_llm": true,
  "unfreeze_speech_projector": true,
  "use_flash_attn": true,
  "use_fp16": true,
  "use_lora": true,
  "valid_interval": 5000
}
2025-04-25 00:57:07,641 INFO [train.py:655] About to create model
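The batching settings above ("bucketing_sampler": true, "max_duration": 50, "num_buckets": 30, "drop_last": true) determine batch composition: lhotse's DynamicBucketingSampler groups utterances of similar duration and fills each batch until the summed duration reaches max_duration seconds. The following is a self-contained toy sketch of that idea only; lhotse's real sampler is streaming and considerably more sophisticated, and bucketed_batches is a hypothetical name.

import random

def bucketed_batches(durations, max_duration=50.0, num_buckets=30, drop_last=True, seed=42):
    # Toy duration-bucketed batching: assign utterances to duration buckets,
    # then emit batches whose total duration stays under max_duration.
    rng = random.Random(seed)
    lo, hi = min(durations), max(durations)
    width = (hi - lo) / num_buckets or 1.0
    buckets = [[] for _ in range(num_buckets)]
    for idx, dur in enumerate(durations):
        buckets[min(int((dur - lo) / width), num_buckets - 1)].append(idx)
    for bucket in buckets:
        rng.shuffle(bucket)
        batch, total = [], 0.0
        for idx in bucket:
            if batch and total + durations[idx] > max_duration:
                yield batch
                batch, total = [], 0.0
            batch.append(idx)
            total += durations[idx]
        if batch and not drop_last:  # drop_last=true discards this remainder
            yield batch

rng = random.Random(0)
durations = [rng.uniform(1.0, 15.0) for _ in range(200)]
for batch in list(bucketed_batches(durations, drop_last=False))[:3]:
    print(len(batch), round(sum(durations[idx] for idx in batch), 1))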
log/log-train-2025-04-25-00-58-05 ADDED
@@ -0,0 +1,575 @@
2025-04-25 00:58:05,568 INFO [train.py:653] {
  "allowed_excess_duration_ratio": 0.1,
  "audio_key": "question_audio",
  "batch_idx_train": 0,
  "best_train_epoch": -1,
  "best_train_loss": Infinity,
  "best_valid_epoch": -1,
  "best_valid_loss": Infinity,
  "bucketing_sampler": true,
  "deepscale": false,
  "deepscale_config": null,
  "deepspeed": true,
  "deepspeed_config": "./slam_omni/ds_config_zero1.json",
  "drop_last": true,
  "enable_musan": false,
  "enable_spec_aug": true,
  "enable_speech_output": true,
  "encoder_projector_ds_rate": 8,
  "env_info": {
    "IP address": "0.114.183.253",
    "hostname": "7518205",
    "icefall-git-branch": null,
    "icefall-git-date": null,
    "icefall-git-sha1": null,
    "icefall-path": "/workspace/slam/icefall_omni",
    "k2-build-type": "Release",
    "k2-git-date": "Tue Oct 29 09:02:19 2024",
    "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
    "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
    "k2-version": "1.24.4",
    "k2-with-cuda": true,
    "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
    "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
    "python-version": "3.11",
    "torch-cuda-available": true,
    "torch-cuda-version": "12.4",
    "torch-version": "2.4.0"
  },
  "exp_dir": "slam_omni/exp_speech2speech_rerun",
  "frame_shift_ms": 10,
  "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
  "input_strategy": "PrecomputedFeatures",
  "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
  "log_interval": 50,
  "manifest_dir": "data/fbank",
  "max_duration": 50,
  "num_buckets": 30,
  "num_epochs": 10,
  "num_workers": 2,
  "on_the_fly_feats": false,
  "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
  "resample_to_16kHz": true,
  "reset_interval": 200,
  "return_cuts": true,
  "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
  "seed": 42,
  "shuffle": true,
  "spec_aug_time_warp_factor": 80,
  "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
  "start_epoch": 1,
  "subsampling_factor": 2,
  "tensorboard": true,
  "text_key": "answer",
  "unfreeze_llm": true,
  "unfreeze_speech_projector": true,
  "use_flash_attn": true,
  "use_fp16": true,
  "use_lora": true,
  "valid_interval": 5000
}
2025-04-25 00:58:05,569 INFO [train.py:655] About to create model
2025-04-25 00:58:25,413 INFO [train.py:808] Number of model parameters: 1314396804
2025-04-25 00:58:25,413 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
243 |
+
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
244 |
+
2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
245 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
246 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
247 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
248 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
249 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
250 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
251 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
252 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
253 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
254 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
255 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
256 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
257 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
258 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
259 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
260 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
261 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
262 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
263 |
+
2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
264 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
265 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
266 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
267 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
268 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
269 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
270 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
271 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
272 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
273 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
274 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
275 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
276 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
277 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
278 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
279 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
280 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
281 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
282 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
283 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
284 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
285 |
+
2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
286 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
287 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
288 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
289 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
290 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
291 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
292 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
293 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
294 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
295 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
296 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
297 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
298 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
299 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
300 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
301 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
302 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
303 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
304 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
305 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
306 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
307 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
308 |
+
2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
309 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
310 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
311 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
312 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
313 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
314 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
315 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
316 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
317 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
318 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
319 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
320 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
321 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
322 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
323 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
324 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
325 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
326 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
327 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
|
328 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
|
329 |
+
2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
|
330 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
|
331 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
|
332 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
|
333 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
|
334 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
|
335 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
|
336 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
|
337 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
|
338 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
|
339 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
|
340 |
+
2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
|
341 |
+
341 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
342 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
343 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
344 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
345 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
346 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
347 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
348 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
349 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
350 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
351 + 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
352 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
353 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
354 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
355 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
356 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
357 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
358 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
359 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
360 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
361 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
362 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
363 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
364 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
365 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
366 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
367 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
368 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
369 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
370 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
371 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
372 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
373 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
374 + 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
375 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
376 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
377 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
378 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
379 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
380 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
381 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
382 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
383 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
384 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
385 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
386 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
387 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
388 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
389 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
390 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
391 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
392 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
393 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
394 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
395 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
396 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
397 + 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
398 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
399 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
400 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
401 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
402 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
403 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
404 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
405 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
406 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
407 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
408 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
409 + 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
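These shapes follow the standard LoRA factorization: for each wrapped projection, lora_A is [r, in_features] and lora_B is [out_features, r] with rank r = 64. The in/out sizes match Qwen2.5-0.5B: hidden size 896 (q_proj/o_proj outputs), 128 = 2 KV heads x 64-dim heads for k_proj/v_proj, and MLP intermediate size 4864. A minimal PEFT sketch that reproduces these shapes (lora_alpha and lora_dropout are assumptions, not recoverable from the log):

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Rank-64 adapters on every attention and MLP projection, as in the log above.
llm = AutoModelForCausalLM.from_pretrained("models/Qwen2.5-0.5B-Instruct")
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,      # assumption: the scaling factor is not visible in the shapes
    lora_dropout=0.05,  # assumption
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # self-attention
        "gate_proj", "up_proj", "down_proj",     # MLP
    ],
    task_type="CAUSAL_LM",
)
llm = get_peft_model(llm, lora_config)
llm.print_trainable_parameters()
# e.g. llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight
# -> torch.Size([128, 64]), matching the log.
```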
410 + 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
411 + 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
412 + 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
413 + 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
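encoder_projector.linear1 takes 10240 = 8 x 1280 inputs, i.e. the 1280-dim Whisper large-v2 encoder frames stacked in groups of 8 (the config dumps below log "encoder_projector_ds_rate": 8), projected into the LLM's 896-dim embedding space. A sketch consistent with these shapes (the activation between the two linears is an assumption):

```python
import torch
import torch.nn as nn

class EncoderProjector(nn.Module):
    """Frame-stacking projector matching the logged shapes:
    linear1.weight [896, 10240], linear2.weight [896, 896]."""

    def __init__(self, encoder_dim: int = 1280, llm_dim: int = 896, ds_rate: int = 8):
        super().__init__()
        self.ds_rate = ds_rate
        self.linear1 = nn.Linear(encoder_dim * ds_rate, llm_dim)
        self.linear2 = nn.Linear(llm_dim, llm_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, T, 1280) Whisper encoder output
        b, t, d = x.shape
        t = t - t % self.ds_rate  # drop the tail that does not fill a group of 8
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.linear2(torch.relu(self.linear1(x)))  # relu is an assumption
```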
414 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.embed_tokens.weight: torch.Size([4100, 1024])
415 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.q_proj.weight: torch.Size([1024, 1024])
416 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.q_proj.bias: torch.Size([1024])
417 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.k_proj.weight: torch.Size([1024, 1024])
418 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.k_proj.bias: torch.Size([1024])
419 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.v_proj.weight: torch.Size([1024, 1024])
420 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.v_proj.bias: torch.Size([1024])
421 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.o_proj.weight: torch.Size([1024, 1024])
422 + 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.mlp.gate_proj.weight: torch.Size([2048, 1024])
423 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.mlp.up_proj.weight: torch.Size([2048, 1024])
424 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.mlp.down_proj.weight: torch.Size([1024, 2048])
425 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.input_layernorm.weight: torch.Size([1024])
426 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.post_attention_layernorm.weight: torch.Size([1024])
427 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.q_proj.weight: torch.Size([1024, 1024])
428 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.q_proj.bias: torch.Size([1024])
429 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.k_proj.weight: torch.Size([1024, 1024])
430 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.k_proj.bias: torch.Size([1024])
431 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.v_proj.weight: torch.Size([1024, 1024])
432 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.v_proj.bias: torch.Size([1024])
433 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.o_proj.weight: torch.Size([1024, 1024])
434 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.mlp.gate_proj.weight: torch.Size([2048, 1024])
435 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.mlp.up_proj.weight: torch.Size([2048, 1024])
436 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.mlp.down_proj.weight: torch.Size([1024, 2048])
437 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.input_layernorm.weight: torch.Size([1024])
438 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.post_attention_layernorm.weight: torch.Size([1024])
439 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.q_proj.weight: torch.Size([1024, 1024])
440 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.q_proj.bias: torch.Size([1024])
441 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.k_proj.weight: torch.Size([1024, 1024])
442 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.k_proj.bias: torch.Size([1024])
443 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.v_proj.weight: torch.Size([1024, 1024])
444 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.v_proj.bias: torch.Size([1024])
445 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.o_proj.weight: torch.Size([1024, 1024])
446 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.mlp.gate_proj.weight: torch.Size([2048, 1024])
447 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.mlp.up_proj.weight: torch.Size([2048, 1024])
448 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.mlp.down_proj.weight: torch.Size([1024, 2048])
449 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.input_layernorm.weight: torch.Size([1024])
450 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.post_attention_layernorm.weight: torch.Size([1024])
451 + 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.3.self_attn.q_proj.weight: torch.Size([1024, 1024])
452 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.q_proj.bias: torch.Size([1024])
453 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.k_proj.weight: torch.Size([1024, 1024])
454 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.k_proj.bias: torch.Size([1024])
455 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.v_proj.weight: torch.Size([1024, 1024])
456 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.v_proj.bias: torch.Size([1024])
457 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.o_proj.weight: torch.Size([1024, 1024])
458 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.mlp.gate_proj.weight: torch.Size([2048, 1024])
459 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.mlp.up_proj.weight: torch.Size([2048, 1024])
460 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.mlp.down_proj.weight: torch.Size([1024, 2048])
461 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.input_layernorm.weight: torch.Size([1024])
462 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.post_attention_layernorm.weight: torch.Size([1024])
463 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.q_proj.weight: torch.Size([1024, 1024])
464 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.q_proj.bias: torch.Size([1024])
465 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.k_proj.weight: torch.Size([1024, 1024])
466 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.k_proj.bias: torch.Size([1024])
467 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.v_proj.weight: torch.Size([1024, 1024])
468 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.v_proj.bias: torch.Size([1024])
469 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.o_proj.weight: torch.Size([1024, 1024])
470 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.mlp.gate_proj.weight: torch.Size([2048, 1024])
471 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.mlp.up_proj.weight: torch.Size([2048, 1024])
472 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.mlp.down_proj.weight: torch.Size([1024, 2048])
473 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.input_layernorm.weight: torch.Size([1024])
474 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.post_attention_layernorm.weight: torch.Size([1024])
475 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.q_proj.weight: torch.Size([1024, 1024])
476 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.q_proj.bias: torch.Size([1024])
477 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.k_proj.weight: torch.Size([1024, 1024])
478 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.k_proj.bias: torch.Size([1024])
479 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.v_proj.weight: torch.Size([1024, 1024])
480 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.v_proj.bias: torch.Size([1024])
481 + 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.o_proj.weight: torch.Size([1024, 1024])
482 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.mlp.gate_proj.weight: torch.Size([2048, 1024])
483 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.mlp.up_proj.weight: torch.Size([2048, 1024])
484 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.mlp.down_proj.weight: torch.Size([1024, 2048])
485 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.input_layernorm.weight: torch.Size([1024])
486 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.post_attention_layernorm.weight: torch.Size([1024])
487 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.q_proj.weight: torch.Size([1024, 1024])
488 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.q_proj.bias: torch.Size([1024])
489 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.k_proj.weight: torch.Size([1024, 1024])
490 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.k_proj.bias: torch.Size([1024])
491 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.v_proj.weight: torch.Size([1024, 1024])
492 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.v_proj.bias: torch.Size([1024])
493 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.o_proj.weight: torch.Size([1024, 1024])
494 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.mlp.gate_proj.weight: torch.Size([2048, 1024])
495 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.mlp.up_proj.weight: torch.Size([2048, 1024])
496 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.mlp.down_proj.weight: torch.Size([1024, 2048])
497 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.input_layernorm.weight: torch.Size([1024])
498 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.post_attention_layernorm.weight: torch.Size([1024])
499 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.q_proj.weight: torch.Size([1024, 1024])
500 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.q_proj.bias: torch.Size([1024])
501 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.k_proj.weight: torch.Size([1024, 1024])
502 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.k_proj.bias: torch.Size([1024])
503 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.v_proj.weight: torch.Size([1024, 1024])
504 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.v_proj.bias: torch.Size([1024])
505 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.o_proj.weight: torch.Size([1024, 1024])
506 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.mlp.gate_proj.weight: torch.Size([2048, 1024])
507 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.mlp.up_proj.weight: torch.Size([2048, 1024])
508 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.mlp.down_proj.weight: torch.Size([1024, 2048])
509 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.input_layernorm.weight: torch.Size([1024])
510 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.post_attention_layernorm.weight: torch.Size([1024])
511 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.8.self_attn.q_proj.weight: torch.Size([1024, 1024])
512 + 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.8.self_attn.q_proj.bias: torch.Size([1024])
513 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.k_proj.weight: torch.Size([1024, 1024])
514 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.k_proj.bias: torch.Size([1024])
515 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.v_proj.weight: torch.Size([1024, 1024])
516 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.v_proj.bias: torch.Size([1024])
517 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.o_proj.weight: torch.Size([1024, 1024])
518 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.mlp.gate_proj.weight: torch.Size([2048, 1024])
519 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.mlp.up_proj.weight: torch.Size([2048, 1024])
520 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.mlp.down_proj.weight: torch.Size([1024, 2048])
521 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.input_layernorm.weight: torch.Size([1024])
522 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.post_attention_layernorm.weight: torch.Size([1024])
523 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.q_proj.weight: torch.Size([1024, 1024])
524 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.q_proj.bias: torch.Size([1024])
525 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.k_proj.weight: torch.Size([1024, 1024])
526 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.k_proj.bias: torch.Size([1024])
527 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.v_proj.weight: torch.Size([1024, 1024])
528 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.v_proj.bias: torch.Size([1024])
529 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.o_proj.weight: torch.Size([1024, 1024])
530 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.mlp.gate_proj.weight: torch.Size([2048, 1024])
531 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.mlp.up_proj.weight: torch.Size([2048, 1024])
532 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.mlp.down_proj.weight: torch.Size([1024, 2048])
533 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.input_layernorm.weight: torch.Size([1024])
534 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.post_attention_layernorm.weight: torch.Size([1024])
535 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.q_proj.weight: torch.Size([1024, 1024])
536 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.q_proj.bias: torch.Size([1024])
537 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.k_proj.weight: torch.Size([1024, 1024])
538 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.k_proj.bias: torch.Size([1024])
539 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.v_proj.weight: torch.Size([1024, 1024])
540 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.v_proj.bias: torch.Size([1024])
541 + 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.o_proj.weight: torch.Size([1024, 1024])
542 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.mlp.gate_proj.weight: torch.Size([2048, 1024])
543 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.mlp.up_proj.weight: torch.Size([2048, 1024])
544 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.mlp.down_proj.weight: torch.Size([1024, 2048])
545 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.input_layernorm.weight: torch.Size([1024])
546 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.post_attention_layernorm.weight: torch.Size([1024])
547 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.q_proj.weight: torch.Size([1024, 1024])
548 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.q_proj.bias: torch.Size([1024])
549 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.k_proj.weight: torch.Size([1024, 1024])
550 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.k_proj.bias: torch.Size([1024])
551 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.v_proj.weight: torch.Size([1024, 1024])
552 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.v_proj.bias: torch.Size([1024])
553 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.o_proj.weight: torch.Size([1024, 1024])
554 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.mlp.gate_proj.weight: torch.Size([2048, 1024])
555 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.mlp.up_proj.weight: torch.Size([2048, 1024])
556 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.mlp.down_proj.weight: torch.Size([1024, 2048])
557 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.input_layernorm.weight: torch.Size([1024])
558 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.post_attention_layernorm.weight: torch.Size([1024])
559 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.norm.weight: torch.Size([1024])
560 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.lm_head.weight: torch.Size([4100, 1024])
561 + 2025-04-25 00:58:25,435 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
562 + 2025-04-25 00:58:25,435 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
563 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
564 + 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
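The codec_lm parameters describe a separate, much smaller Qwen2-style decoder: 12 layers, hidden size 1024, MLP intermediate size 2048, and a 4100-entry vocabulary (consistent with 4096 codec tokens plus a few special tokens, though the exact split is not visible in the log). The [1024, 1024] k/v projections imply full multi-head attention rather than GQA. speech_token_projector maps 1792 = 2 x 896 concatenated LLM features into this 1024-dim space, and codec_lm_head is a separate biased output head. A hedged configuration sketch:

```python
import torch
from transformers import Qwen2Config, Qwen2ForCausalLM

# Values read off the logged shapes; head counts are assumptions (any head count
# with num_key_value_heads == num_attention_heads fits the [1024, 1024] k/v weights).
codec_lm = Qwen2ForCausalLM(
    Qwen2Config(
        vocab_size=4100,
        hidden_size=1024,
        num_hidden_layers=12,
        intermediate_size=2048,
        num_attention_heads=16,
        num_key_value_heads=16,
        tie_word_embeddings=False,  # embed_tokens and lm_head are both [4100, 1024]
    )
)

# 1792 = 2 * 896: two concatenated 896-dim LLM feature streams (which two is an
# assumption, e.g. hidden state plus token embedding) projected to the codec LM.
speech_token_projector = torch.nn.Linear(2 * 896, 1024)
codec_lm_head = torch.nn.Linear(1024, 4100, bias=True)  # the biased head in the log
```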
565 + 2025-04-25 00:58:25,435 INFO [train.py:819] Device: cuda:1
566 + 2025-04-25 00:58:26,665 INFO [train.py:823] Using DeepSpeed
567 + 2025-04-25 00:58:30,432 INFO [data_module.py:445] About to get train cuts
568 + 2025-04-25 00:58:30,434 INFO [data_module.py:250] Disable MUSAN
569 + 2025-04-25 00:58:30,434 INFO [data_module.py:268] Enable SpecAugment
570 + 2025-04-25 00:58:30,434 INFO [data_module.py:269] Time warp factor: 80
571 + 2025-04-25 00:58:30,434 INFO [data_module.py:279] Num frame mask: 10
572 + 2025-04-25 00:58:30,434 INFO [data_module.py:292] About to create train dataset
573 + 2025-04-25 00:58:30,434 INFO [data_module.py:319] Using DynamicBucketingSampler.
574 + 2025-04-25 00:58:31,421 INFO [data_module.py:336] About to create train dataloader
575 + 2025-04-25 00:58:31,421 INFO [data_module.py:339] Loading sampler state dict
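The final lines show the lhotse data pipeline being rebuilt for the resumed run, ending with the sampler state restore. A minimal sketch of that step (train_cuts stands in for the CutSet returned by "About to get train cuts"; the values mirror the config dumps below):

```python
import torch
from lhotse.dataset import DynamicBucketingSampler

sampler = DynamicBucketingSampler(
    train_cuts,          # assumption: the lhotse CutSet loaded from data/fbank
    max_duration=50.0,   # "max_duration": 50 in the config below
    num_buckets=30,      # "num_buckets": 30
    shuffle=True,
    drop_last=True,
)
# "Loading sampler state dict": resume mid-epoch from the saved sampler position.
sampler.load_state_dict(
    torch.load("./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt")
)
```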
log/log-train-2025-04-25-01-06-08
ADDED
@@ -0,0 +1,71 @@
1 + 2025-04-25 01:06:08,819 INFO [train.py:653] {
2 +     "allowed_excess_duration_ratio": 0.1,
3 +     "audio_key": "question_audio",
4 +     "batch_idx_train": 0,
5 +     "best_train_epoch": -1,
6 +     "best_train_loss": Infinity,
7 +     "best_valid_epoch": -1,
8 +     "best_valid_loss": Infinity,
9 +     "bucketing_sampler": true,
10 +     "deepscale": false,
11 +     "deepscale_config": null,
12 +     "deepspeed": true,
13 +     "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14 +     "drop_last": true,
15 +     "enable_musan": false,
16 +     "enable_spec_aug": true,
17 +     "enable_speech_output": true,
18 +     "encoder_projector_ds_rate": 8,
19 +     "env_info": {
20 +         "IP address": "0.114.183.253",
21 +         "hostname": "7518205",
22 +         "icefall-git-branch": null,
23 +         "icefall-git-date": null,
24 +         "icefall-git-sha1": null,
25 +         "icefall-path": "/workspace/slam/icefall_omni",
26 +         "k2-build-type": "Release",
27 +         "k2-git-date": "Tue Oct 29 09:02:19 2024",
28 +         "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29 +         "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30 +         "k2-version": "1.24.4",
31 +         "k2-with-cuda": true,
32 +         "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33 +         "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34 +         "python-version": "3.11",
35 +         "torch-cuda-available": true,
36 +         "torch-cuda-version": "12.4",
37 +         "torch-version": "2.4.0"
38 +     },
39 +     "exp_dir": "slam_omni/exp_speech2speech_rerun",
40 +     "frame_shift_ms": 10,
41 +     "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42 +     "input_strategy": "PrecomputedFeatures",
43 +     "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44 +     "log_interval": 50,
45 +     "manifest_dir": "data/fbank",
46 +     "max_duration": 50,
47 +     "num_buckets": 30,
48 +     "num_epochs": 10,
49 +     "num_workers": 2,
50 +     "on_the_fly_feats": false,
51 +     "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
52 +     "resample_to_16kHz": true,
53 +     "reset_interval": 200,
54 +     "return_cuts": true,
55 +     "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
56 +     "seed": 42,
57 +     "shuffle": true,
58 +     "spec_aug_time_warp_factor": 80,
59 +     "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60 +     "start_epoch": 1,
61 +     "subsampling_factor": 2,
62 +     "tensorboard": true,
63 +     "text_key": "answer",
64 +     "unfreeze_llm": true,
65 +     "unfreeze_speech_projector": true,
66 +     "use_flash_attn": true,
67 +     "use_fp16": true,
68 +     "use_lora": true,
69 +     "valid_interval": 5000
70 + }
71 + 2025-04-25 01:06:08,819 INFO [train.py:655] About to create model
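This run resumes from epoch-1-checkpoint-15000 ("pretrained_model_path" above). The checkpoint is a plain state dict, so its contents can be inspected directly; a quick sketch that groups its keys by top-level module and should recover the groups listed in the shape logs (llm, encoder_projector, codec_lm, speech_token_projector, codec_lm_head):

```python
import torch
from collections import Counter

state_dict = torch.load(
    "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
    map_location="cpu",
)
# Count tensors per top-level module prefix.
print(Counter(key.split(".")[0] for key in state_dict))
```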
log/log-train-2025-04-25-01-06-09
ADDED
@@ -0,0 +1,71 @@
1 + 2025-04-25 01:06:09,289 INFO [train.py:653] {
2 +     "allowed_excess_duration_ratio": 0.1,
3 +     "audio_key": "question_audio",
4 +     "batch_idx_train": 0,
5 +     "best_train_epoch": -1,
6 +     "best_train_loss": Infinity,
7 +     "best_valid_epoch": -1,
8 +     "best_valid_loss": Infinity,
9 +     "bucketing_sampler": true,
10 +     "deepscale": false,
11 +     "deepscale_config": null,
12 +     "deepspeed": true,
13 +     "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14 +     "drop_last": true,
15 +     "enable_musan": false,
16 +     "enable_spec_aug": true,
17 +     "enable_speech_output": true,
18 +     "encoder_projector_ds_rate": 8,
19 +     "env_info": {
20 +         "IP address": "0.114.183.253",
21 +         "hostname": "7518205",
22 +         "icefall-git-branch": null,
23 +         "icefall-git-date": null,
24 +         "icefall-git-sha1": null,
25 +         "icefall-path": "/workspace/slam/icefall_omni",
26 +         "k2-build-type": "Release",
27 +         "k2-git-date": "Tue Oct 29 09:02:19 2024",
28 +         "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29 +         "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30 +         "k2-version": "1.24.4",
31 +         "k2-with-cuda": true,
32 +         "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33 +         "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34 +         "python-version": "3.11",
35 +         "torch-cuda-available": true,
36 +         "torch-cuda-version": "12.4",
37 +         "torch-version": "2.4.0"
38 +     },
39 +     "exp_dir": "slam_omni/exp_speech2speech_rerun",
40 +     "frame_shift_ms": 10,
41 +     "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42 +     "input_strategy": "PrecomputedFeatures",
43 +     "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44 +     "log_interval": 50,
45 +     "manifest_dir": "data/fbank",
46 +     "max_duration": 50,
47 +     "num_buckets": 30,
48 +     "num_epochs": 10,
49 +     "num_workers": 2,
50 +     "on_the_fly_feats": false,
51 +     "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
52 +     "resample_to_16kHz": true,
53 +     "reset_interval": 200,
54 +     "return_cuts": true,
55 +     "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
56 +     "seed": 42,
57 +     "shuffle": true,
58 +     "spec_aug_time_warp_factor": 80,
59 +     "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60 +     "start_epoch": 1,
61 +     "subsampling_factor": 2,
62 +     "tensorboard": true,
63 +     "text_key": "answer",
64 +     "unfreeze_llm": true,
65 +     "unfreeze_speech_projector": true,
66 +     "use_flash_attn": true,
67 +     "use_fp16": true,
68 +     "use_lora": true,
69 +     "valid_interval": 5000
70 + }
71 + 2025-04-25 01:06:09,289 INFO [train.py:655] About to create model
log/log-train-2025-04-25-01-07-00
ADDED
The diff for this file is too large to render.
See raw diff
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:2e518bb90d19eff29cab0e36e024ea6c8c51eb0ebf01bbb92026646d414da9c2
3 + size 749750076
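The three lines above are a Git LFS pointer, not the checkpoint itself; the roughly 750 MB pytorch_model.bin is fetched by LFS on clone or download. Once materialized, the file can be checked against the pointer's oid:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks to avoid loading 750 MB into memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of("pytorch_model.bin") == (
    "2e518bb90d19eff29cab0e36e024ea6c8c51eb0ebf01bbb92026646d414da9c2"
)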
tensorboard/events.out.tfevents.1745485127.7515258.1454637.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:3420d0018b82da7a2f43ca561c9fdfa179db66e06bb2d3ac614ce2e15aef91d3
3 + size 135
tensorboard/events.out.tfevents.1745485786.7515258.1472312.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:574e628eb7cf4b844bcce9409abb79402c7e6572268922ca4e0be5303917364a
3 + size 135
tensorboard/events.out.tfevents.1745486182.7515258.1482888.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:d4258223345f90a5e9b4adff784bafa145a6eae5082a911e0f78c0af8f1e8353
3 + size 135
tensorboard/events.out.tfevents.1745489567.7515258.1571829.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:82506830e9b646b9f24e159b391bf95b5d35b7bc747f0ce5a17e357f8e7ecb2a
3 + size 135
tensorboard/events.out.tfevents.1745489713.7515258.1575936.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:d6e078e3c3f3220e183555839741d1b351c12f607c8f4faded3aa6ef3320f8a0
3 + size 135
tensorboard/events.out.tfevents.1745489870.7515258.1580280.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:b604e4364358ea7691f076b13759f31ef10f29a085d85bd1b0851b0eedf3bf9a
3 + size 2749
tensorboard/events.out.tfevents.1745490173.7518205.24828.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:65fdf02f6e5f61a8c53283b6bab3e98481ba09c0df80c0b426ebf5098c34a1a4
3 + size 1239
tensorboard/events.out.tfevents.1745490285.7518205.28691.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:38670ceb5697c143c012d8a151c2d7c2a3df929d4078f9d78dea7e75db336bf0
3 + size 243430
tensorboard/events.out.tfevents.1745505052.7518205.257175.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:61e88e23f8a8a50b7e58df2cabdfab400daedb6b180b2d3a6c4d3708234b9c96
3 + size 68029
tensorboard/events.out.tfevents.1745543674.7518205.398874.0
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:0293cafc5c6bfb857b9c784b441ede3d8066a65e7c2187c8776a7065d906538f
3 + size 3480854
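Each event file above is likewise an LFS pointer; the 135-byte files likely come from the short-lived restarts visible in the training logs, while the larger ones hold the actual curves. They can be read without launching the TensorBoard UI, for example:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("tensorboard/events.out.tfevents.1745543674.7518205.398874.0")
acc.Reload()
print(acc.Tags()["scalars"])  # scalar tags logged during training, e.g. loss curves
```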