yuekai committed · verified
Commit 8c81611 · 1 Parent(s): b9fc209

Upload folder using huggingface_hub
Files changed (42)
  1. .gitattributes +9 -0
  2. log-e2e-epoch10_speech2speech_rerun/2a8211e0e75e000bbae1134b21fdbfa4.wav +3 -0
  3. log-e2e-epoch10_speech2speech_rerun/2e9fddd659b1167aed93534c608f1d99.wav +0 -0
  4. log-e2e-epoch10_speech2speech_rerun/31b4015dd1258ff1f8be44d6504e6956.wav +3 -0
  5. log-e2e-epoch10_speech2speech_rerun/3243962376f8c2e411e587e6482f9367.wav +3 -0
  6. log-e2e-epoch10_speech2speech_rerun/450e7541d093ee0e7e12ab76a0dff546.wav +3 -0
  7. log-e2e-epoch10_speech2speech_rerun/88562bce265cff88246933e62b756e84.wav +3 -0
  8. log-e2e-epoch10_speech2speech_rerun/ca7f2b2ab703a4f37b5da09fd119a28a.wav +3 -0
  9. log-e2e-epoch10_speech2speech_rerun/cer-summary-test-epoch-999-avg-1.txt +2 -0
  10. log-e2e-epoch10_speech2speech_rerun/d60daeaebb5b0b14ee34f5d714075108.wav +3 -0
  11. log-e2e-epoch10_speech2speech_rerun/ee07a1222f1d62cf0358f51d12f4c67c.wav +3 -0
  12. log-e2e-epoch10_speech2speech_rerun/errs-test-beam-search-epoch-999-avg-1.txt +1000 -0
  13. log-e2e-epoch10_speech2speech_rerun/f5cf13be42666535036bdf0f05fcc0e3.wav +3 -0
  14. log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-36-58 +66 -0
  15. log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-40-09 +66 -0
  16. log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-41-35 +75 -0
  17. log-e2e-epoch10_speech2speech_rerun/recogs-test-beam-search-epoch-999-avg-1.txt +26 -0
  18. log/log-train-2025-04-24-08-58-17 +689 -0
  19. log/log-train-2025-04-24-09-09-22 +689 -0
  20. log/log-train-2025-04-24-09-15-56 +689 -0
  21. log/log-train-2025-04-24-10-12-23 +600 -0
  22. log/log-train-2025-04-24-10-14-50 +600 -0
  23. log/log-train-2025-04-24-10-17-26 +607 -0
  24. log/log-train-2025-04-24-10-21-52 +604 -0
  25. log/log-train-2025-04-24-10-24-20 +0 -0
  26. log/log-train-2025-04-24-14-23-07 +0 -0
  27. log/log-train-2025-04-25-00-57-07 +71 -0
  28. log/log-train-2025-04-25-00-58-05 +575 -0
  29. log/log-train-2025-04-25-01-06-08 +71 -0
  30. log/log-train-2025-04-25-01-06-09 +71 -0
  31. log/log-train-2025-04-25-01-07-00 +0 -0
  32. pytorch_model.bin +3 -0
  33. tensorboard/events.out.tfevents.1745485127.7515258.1454637.0 +3 -0
  34. tensorboard/events.out.tfevents.1745485786.7515258.1472312.0 +3 -0
  35. tensorboard/events.out.tfevents.1745486182.7515258.1482888.0 +3 -0
  36. tensorboard/events.out.tfevents.1745489567.7515258.1571829.0 +3 -0
  37. tensorboard/events.out.tfevents.1745489713.7515258.1575936.0 +3 -0
  38. tensorboard/events.out.tfevents.1745489870.7515258.1580280.0 +3 -0
  39. tensorboard/events.out.tfevents.1745490173.7518205.24828.0 +3 -0
  40. tensorboard/events.out.tfevents.1745490285.7518205.28691.0 +3 -0
  41. tensorboard/events.out.tfevents.1745505052.7518205.257175.0 +3 -0
  42. tensorboard/events.out.tfevents.1745543674.7518205.398874.0 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ log-e2e-epoch10_speech2speech_rerun/2a8211e0e75e000bbae1134b21fdbfa4.wav filter=lfs diff=lfs merge=lfs -text
37
+ log-e2e-epoch10_speech2speech_rerun/31b4015dd1258ff1f8be44d6504e6956.wav filter=lfs diff=lfs merge=lfs -text
38
+ log-e2e-epoch10_speech2speech_rerun/3243962376f8c2e411e587e6482f9367.wav filter=lfs diff=lfs merge=lfs -text
39
+ log-e2e-epoch10_speech2speech_rerun/450e7541d093ee0e7e12ab76a0dff546.wav filter=lfs diff=lfs merge=lfs -text
40
+ log-e2e-epoch10_speech2speech_rerun/88562bce265cff88246933e62b756e84.wav filter=lfs diff=lfs merge=lfs -text
41
+ log-e2e-epoch10_speech2speech_rerun/ca7f2b2ab703a4f37b5da09fd119a28a.wav filter=lfs diff=lfs merge=lfs -text
42
+ log-e2e-epoch10_speech2speech_rerun/d60daeaebb5b0b14ee34f5d714075108.wav filter=lfs diff=lfs merge=lfs -text
43
+ log-e2e-epoch10_speech2speech_rerun/ee07a1222f1d62cf0358f51d12f4c67c.wav filter=lfs diff=lfs merge=lfs -text
44
+ log-e2e-epoch10_speech2speech_rerun/f5cf13be42666535036bdf0f05fcc0e3.wav filter=lfs diff=lfs merge=lfs -text
log-e2e-epoch10_speech2speech_rerun/2a8211e0e75e000bbae1134b21fdbfa4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2e2b46ccfa55b20f3f14951cdd5477dd34bc40d85466183d755561fa37aa42c
3
+ size 632364
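The three-line stub above is a Git LFS pointer: the repository stores only this pointer, while the actual WAV bytes live in LFS storage, addressed by the SHA-256 oid. A minimal Python sketch of parsing such a pointer (the function name is illustrative, not part of this repo):

def parse_lfs_pointer(text: str) -> dict:
    """Parse a spec-v1 Git LFS pointer into its version, oid, and size fields."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].split(":", 1)[1],  # drop the "sha256:" prefix
        "size": int(fields["size"]),            # payload size in bytes
    }

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:f2e2b46ccfa55b20f3f14951cdd5477dd34bc40d85466183d755561fa37aa42c\n"
    "size 632364\n"
)
print(parse_lfs_pointer(pointer)["size"])  # -> 632364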
log-e2e-epoch10_speech2speech_rerun/2e9fddd659b1167aed93534c608f1d99.wav ADDED
Binary file (59.9 kB).
log-e2e-epoch10_speech2speech_rerun/31b4015dd1258ff1f8be44d6504e6956.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cf3ef873d0a7ff9e571d49b021955269d9de581544a223c278f8524fa04ff96
3
+ size 903212
log-e2e-epoch10_speech2speech_rerun/3243962376f8c2e411e587e6482f9367.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:989b982aa534f47c7f4d631799c554b5b77f07869a1acf3196a6637db740fd76
3
+ size 903212
log-e2e-epoch10_speech2speech_rerun/450e7541d093ee0e7e12ab76a0dff546.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c97ba2b00e335755d49d9535584d28db36708304749d739329afc739d602f63
3
+ size 785452
log-e2e-epoch10_speech2speech_rerun/88562bce265cff88246933e62b756e84.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2092b38745caa4592f1a81d87c2e5b8cd2847173ae12e63e2387147976cae7a1
3
+ size 326188
log-e2e-epoch10_speech2speech_rerun/ca7f2b2ab703a4f37b5da09fd119a28a.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92f685106b2ffa556c31b03afd26ce63144a7f241395bac53114f480e2fa41b1
3
+ size 903212
log-e2e-epoch10_speech2speech_rerun/cer-summary-test-epoch-999-avg-1.txt ADDED
@@ -0,0 +1,2 @@
1
+ settings CER
2
+ beam-search 229.61
log-e2e-epoch10_speech2speech_rerun/d60daeaebb5b0b14ee34f5d714075108.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb1be48a81e69d6c594cfc255be0788037854cf3b7df9fd157a9db70b1f8e673
3
+ size 903212
log-e2e-epoch10_speech2speech_rerun/ee07a1222f1d62cf0358f51d12f4c67c.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4479c168805652797770f1697db0de4eabda67ad4961649268a4f6cb9ca4b624
3
+ size 280108
log-e2e-epoch10_speech2speech_rerun/errs-test-beam-search-epoch-999-avg-1.txt ADDED
@@ -0,0 +1,1000 @@
1
+ %WER = 229.61
2
+ Errors: 643 insertions, 78 deletions, 163 substitutions, over 385 reference words (144 correct)
3
+ Search below for sections starting with PER-UTT DETAILS:, SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:
4
+
5
+ PER-UTT DETAILS: corr or (ref->hyp)
6
+ 2a8211e0e75e000bbae1134b21fdbfa4: (以 下->答 案) 是 (一 道 小 学 数 学 题 : 有 一 条 直 线 , 上 面 有 6->D . 6 0 %
7
+ 向 右 走 的 蚂 蚁 数 量 为 2) 只 (蚂 蚁->*) , (其 中 4->总 共 6) 只 (向 左 走->蚂 蚁 中) , (*->有) 2 只 (向 右 走 。 问 :->是) 向 右 走 的 (蚂 蚁 数 量 占 比 是 多 少 ? A .->, 因 此 6 0 % 的 6 只 蚂 蚁 中 有) 2 (5 % B . 3 3 . 3 % C . 5 0 %->只 。
8
+ 所 以 , 答 案 选) D (. 6 6 . 6 % 回 答 该 问 题 。->。 < | i m _ e n d | >)
9
+ 2e9fddd659b1167aed93534c608f1d99: (A 公 司 去 年 亏 损 了->2) 5 0 (0->*) 万 美 元 (, 今 年 净 利 润 增 长 了 5 0 % , 今 年 的 净 利 润 是 多 少 ?->。 < | i m _ e n d | >)
10
+ 31b4015dd1258ff1f8be44d6504e6956: (请 给 我 介 绍->人 工 智 能 是) 一 (下->种 模 拟) 人 (*->类 智 能 的 技 术 , 它 包 括 机 器 学 习 、 自 然 语 言 处 理 、 计 算 机 视 觉 等 多 个 方 面 。 目 前 , 人) 工 智 能 (*->技 术 已 经 广 泛 应 用 于 各 个 领 域 , 比 如 医 疗 保 健 、 金 融 、 交 通 等 。 其 中 , 深 度 学 习 是 当 前 最 热 门) 的 技 术 (*->之 一 , 它 可 以 实 现 对 大 量 数 据 的 自 动 分 析) 和 (*->学 习 , 并 且 可 以 用 于 图 像 识 别 、 语 音 识 别 等 方 面 。 未 来 , 随 着 科 技 的) 发 展 (*->, 人 工 智 能 将 会 更 加 普 及 , 为 我 们 的 生 活 带 来 更 多 的 便 利 和 创 新) 。 (*->< | i m _ e n d | >)
11
+ 3243962376f8c2e411e587e6482f9367: (写 一 首 关 于->在 大 自 然 的 怀 抱 中 , 我 感 受 到 了 生 命 的 美 好 。 山 川 河 流 、 花 草 树 木 , 都 散 发 着 独 特 的 魅 力 。 阳 光 明 媚 , 微 风 拂 面 , 让 人 感 受 到) 自 然 (*->的 力 量 和) 美 (景->丽 。 在 这 片 广 袤) 的 (诗 歌->大 地 上 , 我 看 到 了 无 尽 的 可 能 和 希 望 , 也 感 受 到 了 内 心 的 宁 静 和 平 静) 。 (*->< | i m _ e n d | >)
12
+ 450e7541d093ee0e7e12ab76a0dff546: (帮 我 想->好 的 , 以 下 是) 一 些 关 于 狗 的 名 字 (*->: 旺 财 、 小 黑 、 巧 克 力 、 大 黄 、 金 毛 、 拉 布 拉 多 、 吉 娃 娃 、 柯 基 、 贵 宾 、 斗 牛 犬) 。 (*->当 然 , 如 果 您 有 其 他 要 求 或 想 法 , 我 也 可 以 根 据 您 的 需 求 进 行 更 精 准 的 推 荐 。 < | i m _ e n d | >)
13
+ 88562bce265cff88246933e62b756e84: (给 定 一 段 文 本 和 关 键 词 列 表 ,->*) 删 除 (文 本 中 包 含 所 有 给 定 关 键 词->后) 的 (子 字 符 串 。->*) 文 本 (*->为) : " 这 是 一 个 测 试 句 子 , 目 的 是 看 看 模 型 是 否 (可 以->*) 正 确 地 从 这 个 句 子 中 删 除 关 键 词 (。->*) " (\ n 关 键 词 列 表 : [ ‘ 测 试 ’ , ‘ 模 型 ’ ]->。 < | i m _ e n d | >)
14
+ ca7f2b2ab703a4f37b5da09fd119a28a: 根 据 输 入 的 文 本 , (*->可 以) 判 断 它 (是 否->*) 属 于 新 闻 报 道 (、 广 告 或 博 客->类 别 。 原 因 如 下 :
15
+ 1 . ) 文 (章 类 别 之 一 。->本 主 题 : 该 段 文 字 描 述 的 是 奥 运 会 的 消 息 , 这 通 常 是) 新 闻 报 道 (: “ 奥 运 会 在 东 京 隆 重 开 幕->的 主 题 之 一 。
16
+ 2 . 文 本 内 容 : 文 本 中 提 到 了 多 个 事 件) , (*->包 括) 中 国 代 表 团 获 得 多 项 奖 牌 (*->, 这 是 典 型 的 新 闻 报 道 内 容) 。 (”->
17
+ 3 . 文 本 语 气 : 整 个 文 本 的 语 气 客 观 、 中 立 , 没 有 明 显 的 广 告 或 博 客 文 章 的 推 销 和 宣 传 元 素 。
18
+ 综 上 所 述 , 可 以 得 出 结 论 , 这 段 文 本 是 一 篇 新 闻 报 道 。 < | i m _ e n d | >)
19
+ d60daeaebb5b0b14ee34f5d714075108: (给->如 果 你 喜 欢 自 然 风 光 ,) 我 推 荐 (一->你 去 云 南 的 丽 江 、 大 理 和 西 双 版 纳 。 这) 些 (旅 游 景 点->地 方 有 壮 丽 的 山 脉 、 清 澈 的 湖 泊 和 独 特 的 文 化) 。 (*->如 果 你 想 体 验 异 国 风 情 , 可 以 前 往 泰 国 的 曼 谷 、 印 度 尼 西 亚 的 巴 厘 岛 或 日 本 的 东 京 。 此 外 , 欧 洲 有 许 多 美 丽 的 城 市 , 如 巴 黎 、 威 尼 斯 和 阿 姆 斯 特 丹 等 , 也 是 不 错 的 选 择 。 < | i m _ e n d | >)
20
+ ee07a1222f1d62cf0358f51d12f4c67c: (很->*) 好 (*->的) , (现 在 请 你->让 我 来) 计 算 一 下 (*->。 在) 这 篇 文 章 中 (*->, “ 获 得 ”) 出 现 了 (多 少 个 “ 获 得 ” 字 样 。->两 次 。 < | i m _ e n d | >)
21
+ f5cf13be42666535036bdf0f05fcc0e3: 好 的 (。 现 在 请 你 将->, 替 换 后 的 文 本 为 : ") 这 (*->是 一) 个 (文 本 中 的 所 有->句 子 这 是 目) 的 (逗 号 都 替 换 成 空 格->看 看 是 否 可 以 正 确 地 从 这 个 句 子 中 删 除 关 键 词 ") 。 (*->< | i m _ e n d | >)
22
+
23
+ SUBSTITUTIONS: count ref -> hyp
24
+ 1 ? -> 蚂
25
+ 1 : -> 的
26
+ 1 : -> 向
27
+ 1 : -> n
28
+ 1 , -> 数
29
+ 1 , -> 。
30
+ 1 首 -> 自
31
+ 1 题 -> >
32
+ 1 题 ->
33
+
34
+ 1 面 -> 为
35
+ 1 隆 -> .
36
+ 1 问 -> |
37
+ 1 长 -> n
38
+ 1 键 -> i
39
+ 1 量 -> 6
40
+ 1 重 ->
41
+ 1 都 -> 是
42
+ 1 道 -> .
43
+ 1 逗 -> 看
44
+ 1 运 -> 之
45
+ 1 走 -> 中
46
+ 1 请 -> 来
47
+ 1 请 -> 后
48
+ 1 请 -> 人
49
+ 1 该 -> d
50
+ 1 诗 -> 大
51
+ 1 词 -> m
52
+ 1 表 -> e
53
+ 1 蚂 -> ,
54
+ 1 蚁 -> 因
55
+ 1 获 -> |
56
+ 1 给 -> 工
57
+ 1 给 -> 如
58
+ 1 绍 -> 是
59
+ 1 线 -> 蚁
60
+ 1 类 -> 主
61
+ 1 答 -> n
62
+ 1 章 -> 本
63
+ 1 空 -> 正
64
+ 1 直 -> 蚂
65
+ 1 的 -> 这
66
+ 1 现 -> 让
67
+ 1 现 -> 替
68
+ 1 点 -> 壮
69
+ 1 游 -> 方
70
+ 1 润 -> _
71
+ 1 测 -> >
72
+ 1 比 -> %
73
+ 1 歌 -> 地
74
+ 1 格 -> 确
75
+ 1 样 -> e
76
+ 1 条 -> 的
77
+ 1 本 -> 子
78
+ 1 有 -> 目
79
+ 1 有 -> 右
80
+ 1 有 -> 2
81
+ 1 替 -> 否
82
+ 1 景 -> 有
83
+ 1 景 -> 丽
84
+ 1 是 -> 的
85
+ 1 旅 -> 地
86
+ 1 文 -> 后
87
+ 1 文 -> 句
88
+ 1 数 -> 此
89
+ 1 数 -> 0
90
+ 1 换 -> 可
91
+ 1 所 -> 是
92
+ 1 或 -> 原
93
+ 1 我 -> 的
94
+ 1 我 -> 智
95
+ 1 成 -> 以
96
+ 1 想 -> ,
97
+ 1 得 -> i
98
+ 1 开 -> 文
99
+ 1 广 -> 别
100
+ 1 年 -> |
101
+ 1 幕 -> 本
102
+ 1 帮 -> 好
103
+ 1 左 -> 蚁
104
+ 1 少 -> 次
105
+ 1 少 -> 只
106
+ 1 小 ->
107
+ 1 将 -> 文
108
+ 1 客 -> 如
109
+ 1 学 -> 6
110
+ 1 学 -> %
111
+ 1 字 -> _
112
+ 1 奥 -> 题
113
+ 1 多 -> 两
114
+ 1 多 -> 6
115
+ 1 增 -> e
116
+ 1 在 -> 换
117
+ 1 在 -> 我
118
+ 1 在 -> 。
119
+ 1 回 -> e
120
+ 1 告 -> 。
121
+ 1 向 -> 蚂
122
+ 1 向 -> 是
123
+ 1 号 -> 看
124
+ 1 占 -> 0
125
+ 1 博 -> 因
126
+ 1 别 -> 题
127
+ 1 利 -> m
128
+ 1 列 -> _
129
+ 1 净 -> i
130
+ 1 写 -> 在
131
+ 1 其 -> 总
132
+ 1 关 -> 然
133
+ 1 关 -> |
134
+ 1 你 -> 的
135
+ 1 会 -> 一
136
+ 1 以 -> 答
137
+ 1 介 -> 能
138
+ 1 今 -> <
139
+ 1 京 -> 2
140
+ 1 于 -> 的
141
+ 1 了 -> d
142
+ 1 之 -> :
143
+ 1 中 -> 共
144
+ 1 中 ->
145
+ 1 个 -> 。
146
+ 1 东 ->
147
+
148
+ 1 下 -> 种
149
+ 1 下 -> 案
150
+ 1 上 -> 量
151
+ 1 一 -> 走
152
+ 1 一 -> 该
153
+ 1 一 -> 大
154
+ 1 一 -> 你
155
+ 1 一 -> D
156
+ 1 。 -> ,
157
+ 1 。 -> 段
158
+ 1 。 -> n
159
+ 1 、 -> 类
160
+ 1 ” -> m
161
+ 1 ” ->
162
+
163
+ 1 “ -> 主
164
+ 1 “ -> <
165
+ 1 ‘ -> |
166
+ 1 n -> <
167
+ 1 \ -> 。
168
+ 1 [ -> d
169
+ 1 B ->
170
+
171
+ 1 A -> 蚁
172
+ 1 A -> 2
173
+ 1 6 -> |
174
+ 1 6 -> m
175
+ 1 6 -> <
176
+ 1 5 -> 只
177
+ 1 5 -> |
178
+ 1 4 -> 6
179
+ 1 3 -> ,
180
+ 1 3 -> 案
181
+ 1 3 -> 以
182
+ 1 0 -> >
183
+ 1 . -> 答
184
+ 1 . -> 所
185
+ 1 . -> 中
186
+ 1 . -> 。
187
+ 1 . -> i
188
+ 1 % -> 选
189
+ 1 % -> 。
190
+ 1 % -> _
191
+
192
+ DELETIONS: count ref
193
+ 4 。
194
+ 3 ,
195
+ 2 键
196
+ 2 词
197
+ 2 给
198
+ 2 本
199
+ 2 是
200
+ 2 年
201
+ 2 定
202
+ 2 关
203
+ 2 ’
204
+ 2 0
205
+ 2 %
206
+ 1 ?
207
+ 1 :
208
+ 1 问
209
+ 1 走
210
+ 1 试
211
+ 1 表
212
+ 1 蚂
213
+ 1 蚁
214
+ 1 符
215
+ 1 的
216
+ 1 润
217
+ 1 段
218
+ 1 模
219
+ 1 有
220
+ 1 文
221
+ 1 损
222
+ 1 所
223
+ 1 很
224
+ 1 少
225
+ 1 字
226
+ 1 子
227
+ 1 多
228
+ 1 型
229
+ 1 和
230
+ 1 含
231
+ 1 否
232
+ 1 司
233
+ 1 右
234
+ 1 可
235
+ 1 去
236
+ 1 包
237
+ 1 利
238
+ 1 列
239
+ 1 净
240
+ 1 公
241
+ 1 你
242
+ 1 以
243
+ 1 今
244
+ 1 亏
245
+ 1 了
246
+ 1 串
247
+ 1 中
248
+ 1 一
249
+ 1 ‘
250
+ 1 ]
251
+ 1 C
252
+ 1 6
253
+ 1 5
254
+ 1 .
255
+
256
+ INSERTIONS: count hyp
257
+ 28 ,
258
+ 28 的
259
+ 20 、
260
+ 13 。
261
+ 13 |
262
+ 8 是
263
+ 8 和
264
+ 7 文
265
+ 7 可
266
+ 7 以
267
+ 7 d
268
+ 7 >
269
+ 6 这
270
+ 6 本
271
+ 6 n
272
+ 6 m
273
+ 6 i
274
+ 6 e
275
+ 6 _
276
+ 6 <
277
+ 5 :
278
+ 5 有
279
+ 5 多
280
+ 5 到
281
+ 5 中
282
+ 5 个
283
+ 4 语
284
+ 4 等
285
+ 4 我
286
+ 4 如
287
+ 4 了
288
+ 3 风
289
+ 3 面
290
+ 3 自
291
+ 3 能
292
+ 3 特
293
+ 3 然
294
+ 3 果
295
+ 3 更
296
+ 3 新
297
+ 3 技
298
+ 3 或
299
+ 3 感
300
+ 3 广
301
+ 3 学
302
+ 3 大
303
+ 3 受
304
+ 3 力
305
+ 3 前
306
+ 3 内
307
+ 3 人
308
+ 3 习
309
+ 3 也
310
+ 3 丽
311
+ 3 为
312
+ 3 一
313
+ 2 静
314
+ 2 闻
315
+ 2 金
316
+ 2 量
317
+ 2 道
318
+ 2 通
319
+ 2 述
320
+ 2 识
321
+ 2 西
322
+ 2 美
323
+ 2 着
324
+ 2 用
325
+ 2 生
326
+ 2 理
327
+ 2 独
328
+ 2 求
329
+ 2 气
330
+ 2 来
331
+ 2 机
332
+ 2 术
333
+ 2 智
334
+ 2 明
335
+ 2 方
336
+ 2 斯
337
+ 2 推
338
+ 2 据
339
+ 2 括
340
+ 2 拉
341
+ 2 报
342
+ 2 想
343
+ 2 您
344
+ 2 得
345
+ 2 当
346
+ 2 度
347
+ 2 巴
348
+ 2 山
349
+ 2 尼
350
+ 2 容
351
+ 2 客
352
+ 2 它
353
+ 2 娃
354
+ 2 在
355
+ 2 国
356
+ 2 包
357
+ 2 别
358
+ 2 其
359
+ 2 光
360
+ 2 你
361
+ 2 会
362
+ 2 于
363
+ 2 下
364
+ 2 上
365
+ 2 .
366
+ 2 "
367
+ 2
368
+ 2
369
+
370
+ 1 黑
371
+ 1 黎
372
+ 1 黄
373
+ 1 魅
374
+ 1 验
375
+ 1 领
376
+ 1 音
377
+ 1 需
378
+ 1 随
379
+ 1 除
380
+ 1 阿
381
+ 1 阳
382
+ 1 门
383
+ 1 键
384
+ 1 错
385
+ 1 销
386
+ 1 都
387
+ 1 选
388
+ 1 进
389
+ 1 运
390
+ 1 贵
391
+ 1 财
392
+ 1 谷
393
+ 1 词
394
+ 1 论
395
+ 1 许
396
+ 1 让
397
+ 1 计
398
+ 1 言
399
+ 1 觉
400
+ 1 视
401
+ 1 观
402
+ 1 要
403
+ 1 袤
404
+ 1 行
405
+ 1 融
406
+ 1 获
407
+ 1 荐
408
+ 1 草
409
+ 1 花
410
+ 1 脉
411
+ 1 综
412
+ 1 结
413
+ 1 经
414
+ 1 纳
415
+ 1 素
416
+ 1 精
417
+ 1 类
418
+ 1 篇
419
+ 1 算
420
+ 1 章
421
+ 1 立
422
+ 1 科
423
+ 1 看
424
+ 1 目
425
+ 1 疗
426
+ 1 现
427
+ 1 犬
428
+ 1 牛
429
+ 1 版
430
+ 1 片
431
+ 1 热
432
+ 1 澈
433
+ 1 湖
434
+ 1 清
435
+ 1 深
436
+ 1 消
437
+ 1 流
438
+ 1 活
439
+ 1 洲
440
+ 1 泰
441
+ 1 泛
442
+ 1 法
443
+ 1 泊
444
+ 1 河
445
+ 1 没
446
+ 1 江
447
+ 1 毛
448
+ 1 比
449
+ 1 段
450
+ 1 此
451
+ 1 欧
452
+ 1 欢
453
+ 1 模
454
+ 1 根
455
+ 1 树
456
+ 1 柯
457
+ 1 析
458
+ 1 未
459
+ 1 木
460
+ 1 望
461
+ 1 最
462
+ 1 曼
463
+ 1 普
464
+ 1 显
465
+ 1 旺
466
+ 1 日
467
+ 1 无
468
+ 1 斗
469
+ 1 整
470
+ 1 数
471
+ 1 散
472
+ 1 提
473
+ 1 描
474
+ 1 择
475
+ 1 拟
476
+ 1 拂
477
+ 1 抱
478
+ 1 所
479
+ 1 情
480
+ 1 息
481
+ 1 怀
482
+ 1 心
483
+ 1 微
484
+ 1 往
485
+ 1 异
486
+ 1 应
487
+ 1 并
488
+ 1 平
489
+ 1 常
490
+ 1 带
491
+ 1 希
492
+ 1 布
493
+ 1 市
494
+ 1 已
495
+ 1 巧
496
+ 1 工
497
+ 1 川
498
+ 1 岛
499
+ 1 尽
500
+ 1 小
501
+ 1 将
502
+ 1 对
503
+ 1 宾
504
+ 1 宣
505
+ 1 实
506
+ 1 宁
507
+ 1 字
508
+ 1 子
509
+ 1 媚
510
+ 1 威
511
+ 1 姆
512
+ 1 好
513
+ 1 奥
514
+ 1 外
515
+ 1 处
516
+ 1 基
517
+ 1 域
518
+ 1 城
519
+ 1 型
520
+ 1 地
521
+ 1 图
522
+ 1 器
523
+ 1 喜
524
+ 1 命
525
+ 1 告
526
+ 1 吉
527
+ 1 各
528
+ 1 句
529
+ 1 发
530
+ 1 双
531
+ 1 及
532
+ 1 去
533
+ 1 厘
534
+ 1 印
535
+ 1 博
536
+ 1 南
537
+ 1 医
538
+ 1 化
539
+ 1 动
540
+ 1 加
541
+ 1 利
542
+ 1 删
543
+ 1 创
544
+ 1 分
545
+ 1 出
546
+ 1 准
547
+ 1 典
548
+ 1 关
549
+ 1 克
550
+ 1 元
551
+ 1 像
552
+ 1 健
553
+ 1 保
554
+ 1 便
555
+ 1 体
556
+ 1 传
557
+ 1 件
558
+ 1 们
559
+ 1 他
560
+ 1 从
561
+ 1 京
562
+ 1 交
563
+ 1 亚
564
+ 1 云
565
+ 1 事
566
+ 1 之
567
+ 1 丹
568
+ 1 东
569
+ 1 且
570
+ 1 不
571
+ 1 ”
572
+ 1 “
573
+ 1 3
574
+ 1 1
575
+
576
+ PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp
577
+ , 6 37 11 38
578
+ 的 9 36 11 43
579
+ 。 6 27 13 26
580
+ 、 0 21 1 20
581
+ | 0 20 0 20
582
+ 是 4 15 7 16
583
+ 文 4 12 7 13
584
+ 本 2 11 5 10
585
+ 以 0 11 2 9
586
+ n 0 11 1 10
587
+ : 1 10 5 7
588
+ 有 0 10 4 6
589
+ 中 3 10 6 10
590
+ 一 4 10 10 8
591
+ m 0 10 0 10
592
+ i 0 10 0 10
593
+ e 0 10 0 10
594
+ d 0 10 0 10
595
+ _ 0 10 0 10
596
+ > 0 10 0 10
597
+ < 0 10 0 10
598
+ . 0 10 6 4
599
+ 和 1 9 2 9
600
+ 可 0 9 1 8
601
+ 多 1 8 4 6
602
+ 6 0 8 4 4
603
+ 这 4 7 4 11
604
+ 我 1 7 3 6
605
+ % 0 7 5 2
606
+ 如 0 6 0 6
607
+ 在 0 6 3 3
608
+ 了 1 6 3 5
609
+ 个 3 6 4 8
610
+
611
+ 0 6 0 6
612
+ 蚂 0 5 2 3
613
+ 蚁 0 5 2 3
614
+ 学 0 5 2 3
615
+ 大 0 5 0 5
616
+ 到 0 5 0 5
617
+ 关 2 5 6 3
618
+ 你 0 5 2 3
619
+ 0 1 5 4 3
620
+ 0 5 0 5
621
+ 题 0 4 2 2
622
+ 面 0 4 1 3
623
+ 键 1 4 4 2
624
+ 量 0 4 1 3
625
+ 语 0 4 0 4
626
+ 词 1 4 4 2
627
+ 自 1 4 1 5
628
+ 能 1 4 1 5
629
+ 给 0 4 4 0
630
+ 等 0 4 0 4
631
+ 然 1 4 1 5
632
+ 数 0 4 2 2
633
+ 所 0 4 2 2
634
+ 或 0 4 1 3
635
+ 广 0 4 1 3
636
+ 别 0 4 1 3
637
+ 人 1 4 1 5
638
+ 丽 0 4 0 4
639
+ 为 0 4 0 4
640
+ 下 1 4 3 3
641
+ 3 0 4 3 1
642
+ 风 0 3 0 3
643
+ 道 2 3 3 4
644
+ 走 1 3 3 2
645
+ 请 0 3 3 0
646
+ 类 0 3 1 2
647
+ 答 0 3 1 2
648
+ 看 2 3 2 5
649
+ 现 1 3 3 2
650
+ 特 0 3 0 3
651
+ 段 0 3 1 2
652
+ 果 0 3 0 3
653
+ 来 0 3 0 3
654
+ 更 0 3 0 3
655
+ 智 1 3 1 4
656
+ 方 0 3 0 3
657
+ 新 2 3 2 5
658
+ 技 1 3 1 4
659
+ 感 0 3 0 3
660
+ 想 0 3 1 2
661
+ 得 1 3 2 3
662
+ 年 0 3 3 0
663
+ 少 0 3 3 0
664
+ 客 0 3 1 2
665
+ 字 1 3 3 2
666
+ 子 2 3 3 4
667
+ 地 1 3 1 4
668
+ 向 1 3 3 2
669
+ 受 0 3 0 3
670
+ 包 0 3 1 2
671
+ 力 0 3 0 3
672
+ 前 0 3 0 3
673
+ 利 0 3 2 1
674
+ 内 0 3 0 3
675
+ 其 0 3 1 2
676
+ 会 0 3 1 2
677
+ 于 2 3 3 4
678
+ 习 0 3 0 3
679
+ 也 0 3 0 3
680
+ 之 0 3 1 2
681
+ 上 0 3 1 2
682
+ ” 0 3 2 1
683
+ “ 0 3 2 1
684
+ 5 1 3 4 1
685
+ 2 2 3 2 5
686
+ ? 0 2 2 0
687
+ 静 0 2 0 2
688
+ 闻 2 2 2 4
689
+ 问 0 2 2 0
690
+ 金 0 2 0 2
691
+ 都 0 2 1 1
692
+ 通 0 2 0 2
693
+ 选 0 2 0 2
694
+ 述 0 2 0 2
695
+ 运 0 2 1 1
696
+ 该 0 2 1 1
697
+ 识 0 2 0 2
698
+ 让 0 2 0 2
699
+ 西 0 2 0 2
700
+ 表 1 2 3 1
701
+ 获 1 2 2 2
702
+ 美 2 2 2 4
703
+ 章 1 2 2 2
704
+ 着 0 2 0 2
705
+ 目 1 2 1 3
706
+ 用 0 2 0 2
707
+ 生 0 2 0 2
708
+ 理 0 2 0 2
709
+ 独 0 2 0 2
710
+ 润 0 2 2 0
711
+ 求 0 2 0 2
712
+ 气 0 2 0 2
713
+ 比 0 2 1 1
714
+ 此 0 2 0 2
715
+ 模 1 2 2 2
716
+ 案 0 2 0 2
717
+ 机 0 2 0 2
718
+ 术 1 2 1 3
719
+ 替 0 2 1 1
720
+ 景 0 2 2 0
721
+ 明 0 2 0 2
722
+ 斯 0 2 0 2
723
+ 推 1 2 1 3
724
+ 据 1 2 1 3
725
+ 换 0 2 1 1
726
+ 括 0 2 0 2
727
+ 拉 0 2 0 2
728
+ 报 2 2 2 4
729
+ 您 0 2 0 2
730
+ 当 0 2 0 2
731
+ 度 0 2 0 2
732
+ 巴 0 2 0 2
733
+ 工 1 2 1 3
734
+ 山 0 2 0 2
735
+ 尼 0 2 0 2
736
+ 小 0 2 1 1
737
+ 将 0 2 1 1
738
+ 容 0 2 0 2
739
+ 定 0 2 2 0
740
+ 它 1 2 1 3
741
+ 娃 0 2 0 2
742
+ 好 2 2 2 4
743
+ 奥 0 2 1 1
744
+ 型 1 2 2 2
745
+ 国 1 2 1 3
746
+ 因 0 2 0 2
747
+ 告 0 2 1 1
748
+ 否 1 2 2 2
749
+ 后 0 2 0 2
750
+ 右 1 2 2 2
751
+ 只 3 2 3 5
752
+ 句 2 2 2 4
753
+ 去 0 2 1 1
754
+ 博 0 2 1 1
755
+ 列 0 2 2 0
756
+ 净 0 2 2 0
757
+ 光 0 2 0 2
758
+ 今 0 2 2 0
759
+ 京 0 2 1 1
760
+ 主 0 2 0 2
761
+ 东 0 2 1 1
762
+ ’ 0 2 2 0
763
+ ‘ 0 2 2 0
764
+ A 0 2 2 0
765
+ " 2 2 2 4
766
+ 黑 0 1 0 1
767
+ 黎 0 1 0 1
768
+ 黄 0 1 0 1
769
+ 魅 0 1 0 1
770
+ 验 0 1 0 1
771
+ 首 0 1 1 0
772
+ 领 0 1 0 1
773
+ 音 0 1 0 1
774
+ 需 0 1 0 1
775
+ 随 0 1 0 1
776
+ 隆 0 1 1 0
777
+ 除 2 1 2 3
778
+ 阿 0 1 0 1
779
+ 阳 0 1 0 1
780
+ 门 0 1 0 1
781
+ 长 0 1 1 0
782
+ 错 0 1 0 1
783
+ 销 0 1 0 1
784
+ 重 0 1 1 0
785
+ 逗 0 1 1 0
786
+ 进 0 1 0 1
787
+ 贵 0 1 0 1
788
+ 财 0 1 0 1
789
+ 谷 0 1 0 1
790
+ 诗 0 1 1 0
791
+ 试 1 1 2 1
792
+ 论 0 1 0 1
793
+ 许 0 1 0 1
794
+ 计 1 1 1 2
795
+ 言 0 1 0 1
796
+ 觉 0 1 0 1
797
+ 视 0 1 0 1
798
+ 观 0 1 0 1
799
+ 要 0 1 0 1
800
+ 袤 0 1 0 1
801
+ 行 0 1 0 1
802
+ 融 0 1 0 1
803
+ 荐 1 1 1 2
804
+ 草 0 1 0 1
805
+ 花 0 1 0 1
806
+ 脉 0 1 0 1
807
+ 综 0 1 0 1
808
+ 结 0 1 0 1
809
+ 经 0 1 0 1
810
+ 绍 0 1 1 0
811
+ 线 0 1 1 0
812
+ 纳 0 1 0 1
813
+ 素 0 1 0 1
814
+ 精 0 1 0 1
815
+ 篇 1 1 1 2
816
+ 算 1 1 1 2
817
+ 符 0 1 1 0
818
+ 立 0 1 0 1
819
+ 空 0 1 1 0
820
+ 科 0 1 0 1
821
+ 种 0 1 0 1
822
+ 确 1 1 1 2
823
+ 直 0 1 1 0
824
+ 疗 0 1 0 1
825
+ 犬 0 1 0 1
826
+ 牛 0 1 0 1
827
+ 版 0 1 0 1
828
+ 片 0 1 0 1
829
+ 热 0 1 0 1
830
+ 点 0 1 1 0
831
+ 澈 0 1 0 1
832
+ 湖 0 1 0 1
833
+ 游 0 1 1 0
834
+ 清 0 1 0 1
835
+ 深 0 1 0 1
836
+ 消 0 1 0 1
837
+ 测 1 1 2 1
838
+ 流 0 1 0 1
839
+ 活 0 1 0 1
840
+ 洲 0 1 0 1
841
+ 泰 0 1 0 1
842
+ 泛 0 1 0 1
843
+ 法 0 1 0 1
844
+ 泊 0 1 0 1
845
+ 河 0 1 0 1
846
+ 没 0 1 0 1
847
+ 江 0 1 0 1
848
+ 毛 0 1 0 1
849
+ 正 1 1 1 2
850
+ 歌 0 1 1 0
851
+ 欧 0 1 0 1
852
+ 欢 0 1 0 1
853
+ 次 0 1 0 1
854
+ 格 0 1 1 0
855
+ 根 1 1 1 2
856
+ 样 0 1 1 0
857
+ 树 0 1 0 1
858
+ 柯 0 1 0 1
859
+ 析 0 1 0 1
860
+ 条 0 1 1 0
861
+ 未 0 1 0 1
862
+ 木 0 1 0 1
863
+ 望 0 1 0 1
864
+ 最 0 1 0 1
865
+ 曼 0 1 0 1
866
+ 普 0 1 0 1
867
+ 显 0 1 0 1
868
+ 旺 0 1 0 1
869
+ 日 0 1 0 1
870
+ 无 0 1 0 1
871
+ 旅 0 1 1 0
872
+ 斗 0 1 0 1
873
+ 整 0 1 0 1
874
+ 散 0 1 0 1
875
+ 提 0 1 0 1
876
+ 描 0 1 0 1
877
+ 损 0 1 1 0
878
+ 择 0 1 0 1
879
+ 拟 0 1 0 1
880
+ 拂 0 1 0 1
881
+ 抱 0 1 0 1
882
+ 成 0 1 1 0
883
+ 情 0 1 0 1
884
+ 息 0 1 0 1
885
+ 总 0 1 0 1
886
+ 怀 0 1 0 1
887
+ 心 0 1 0 1
888
+ 微 0 1 0 1
889
+ 很 0 1 1 0
890
+ 往 0 1 0 1
891
+ 异 0 1 0 1
892
+ 开 0 1 1 0
893
+ 应 0 1 0 1
894
+ 并 0 1 0 1
895
+ 平 0 1 0 1
896
+ 幕 0 1 1 0
897
+ 常 0 1 0 1
898
+ 帮 0 1 1 0
899
+ 带 0 1 0 1
900
+ 希 0 1 0 1
901
+ 布 0 1 0 1
902
+ 市 0 1 0 1
903
+ 已 0 1 0 1
904
+ 巧 0 1 0 1
905
+ 左 0 1 1 0
906
+ 川 0 1 0 1
907
+ 岛 0 1 0 1
908
+ 尽 0 1 0 1
909
+ 对 0 1 0 1
910
+ 宾 0 1 0 1
911
+ 宣 0 1 0 1
912
+ 实 0 1 0 1
913
+ 宁 0 1 0 1
914
+ 媚 0 1 0 1
915
+ 威 0 1 0 1
916
+ 姆 0 1 0 1
917
+ 外 0 1 0 1
918
+ 处 0 1 0 1
919
+ 壮 0 1 0 1
920
+ 增 0 1 1 0
921
+ 基 0 1 0 1
922
+ 域 0 1 0 1
923
+ 城 0 1 0 1
924
+ 图 0 1 0 1
925
+ 回 0 1 1 0
926
+ 器 0 1 0 1
927
+ 喜 0 1 0 1
928
+ 命 0 1 0 1
929
+ 含 0 1 1 0
930
+ 吉 0 1 0 1
931
+ 各 0 1 0 1
932
+ 司 0 1 1 0
933
+ 号 0 1 1 0
934
+ 发 1 1 1 2
935
+ 双 0 1 0 1
936
+ 及 0 1 0 1
937
+ 原 0 1 0 1
938
+ 厘 0 1 0 1
939
+ 印 0 1 0 1
940
+ 占 0 1 1 0
941
+ 南 0 1 0 1
942
+ 医 0 1 0 1
943
+ 化 0 1 0 1
944
+ 动 0 1 0 1
945
+ 加 0 1 0 1
946
+ 删 2 1 2 3
947
+ 创 0 1 0 1
948
+ 分 0 1 0 1
949
+ 出 1 1 1 2
950
+ 准 0 1 0 1
951
+ 写 0 1 1 0
952
+ 典 0 1 0 1
953
+ 共 0 1 0 1
954
+ 公 0 1 1 0
955
+ 克 0 1 0 1
956
+ 元 1 1 1 2
957
+ 像 0 1 0 1
958
+ 健 0 1 0 1
959
+ 保 0 1 0 1
960
+ 便 0 1 0 1
961
+ 体 0 1 0 1
962
+ 传 0 1 0 1
963
+ 件 0 1 0 1
964
+ 们 0 1 0 1
965
+ 他 0 1 0 1
966
+ 从 1 1 1 2
967
+ 介 0 1 1 0
968
+ 交 0 1 0 1
969
+ 亚 0 1 0 1
970
+ 云 0 1 0 1
971
+ 亏 0 1 1 0
972
+ 事 0 1 0 1
973
+ 丹 0 1 0 1
974
+ 串 0 1 1 0
975
+ 两 0 1 0 1
976
+ 且 0 1 0 1
977
+ 不 0 1 0 1
978
+ ] 0 1 1 0
979
+ \ 0 1 1 0
980
+ [ 0 1 1 0
981
+ D 1 1 1 2
982
+ C 0 1 1 0
983
+ B 0 1 1 0
984
+ 4 0 1 1 0
985
+ 1 0 1 0 1
986
+ 项 1 0 1 1
987
+ 输 1 0 1 1
988
+ 狗 1 0 1 1
989
+ 牌 1 0 1 1
990
+ 断 1 0 1 1
991
+ 属 1 0 1 1
992
+ 展 1 0 1 1
993
+ 奖 1 0 1 1
994
+ 团 1 0 1 1
995
+ 名 1 0 1 1
996
+ 判 1 0 1 1
997
+ 入 1 0 1 1
998
+ 代 1 0 1 1
999
+ 些 2 0 2 2
1000
+ 万 1 0 1 1
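As a sanity check on the summary at the top of this file: %WER (here really a character error rate, since the alignment is per character) is (insertions + deletions + substitutions) divided by the reference length, so it can exceed 100% when hypotheses run much longer than the references, as with these chatty decodes. Reproducing the 229.61 figure from the reported counts:

# Counts copied from the summary above: 643 ins, 78 del, 163 sub, 385 ref chars.
ins, dels, subs, ref_len = 643, 78, 163, 385
wer = 100.0 * (ins + dels + subs) / ref_len
print(f"{wer:.2f}%")  # -> 229.61%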
log-e2e-epoch10_speech2speech_rerun/f5cf13be42666535036bdf0f05fcc0e3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f4910d3a76bde71dfd2668454aa7caf63978e2ecb8290b97ca868e91dfaa242
3
+ size 320044
log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-36-58 ADDED
@@ -0,0 +1,66 @@
1
+ 2025-04-29 06:36:58,118 INFO [decode.py:695] Decoding started
2
+ 2025-04-29 06:36:58,120 INFO [decode.py:696] {
3
+ "audio_key": "question_audio",
4
+ "avg": 1,
5
+ "beam_size": 1,
6
+ "bucketing_sampler": true,
7
+ "drop_last": true,
8
+ "enable_musan": true,
9
+ "enable_spec_aug": true,
10
+ "enable_speech_output": true,
11
+ "encoder_projector_ds_rate": 8,
12
+ "env_info": {
13
+ "IP address": "0.114.200.181",
14
+ "hostname": "7522485",
15
+ "icefall-git-branch": "master",
16
+ "icefall-git-date": "Fri Apr 25 10:31:43 2025",
17
+ "icefall-git-sha1": "d742043e-dirty",
18
+ "icefall-path": "/workspace/slam/icefall_omni",
19
+ "k2-build-type": "Release",
20
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
21
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
22
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
23
+ "k2-version": "1.24.4",
24
+ "k2-with-cuda": true,
25
+ "lhotse-path": "/workspace/slam/lhotse_pr/lhotse/__init__.py",
26
+ "lhotse-version": "1.31.0.dev+git.8c56a3e6.dirty",
27
+ "python-version": "3.11",
28
+ "torch-cuda-available": true,
29
+ "torch-cuda-version": "12.4",
30
+ "torch-version": "2.4.0"
31
+ },
32
+ "epoch": 999,
33
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
34
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
35
+ "input_strategy": "PrecomputedFeatures",
36
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
37
+ "log_dir": "slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun",
38
+ "manifest_dir": "data/fbank",
39
+ "max_duration": 1,
40
+ "method": "e2e-epoch10_speech2speech_rerun",
41
+ "num_buckets": 30,
42
+ "num_workers": 2,
43
+ "on_the_fly_feats": false,
44
+ "remove_whisper_encoder_input_length_restriction": true,
45
+ "resample_to_16kHz": true,
46
+ "return_cuts": true,
47
+ "shuffle": true,
48
+ "spec_aug_time_warp_factor": 80,
49
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
50
+ "suffix": "epoch-999-avg-1",
51
+ "text_key": "answer",
52
+ "token2wav_path": "/workspace/CosyVoice-300M-SFT",
53
+ "use_flash_attn": true,
54
+ "use_lora": true
55
+ }
56
+ 2025-04-29 06:36:58,120 INFO [decode.py:702] device: cuda
57
+ 2025-04-29 06:37:30,467 INFO [flow.py:51] input frame rate=50
58
+ 2025-04-29 06:37:34,971 INFO [processor.py:101] building fst for zh_normalizer ...
59
+ 2025-04-29 06:37:59,937 INFO [processor.py:106] done
60
+ 2025-04-29 06:37:59,942 INFO [processor.py:107] fst path: /opt/conda/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
61
+ 2025-04-29 06:37:59,943 INFO [processor.py:108] /opt/conda/lib/python3.11/site-packages/tn/zh_tn_verbalizer.fst
62
+ 2025-04-29 06:37:59,949 INFO [processor.py:95] found existing fst: /opt/conda/lib/python3.11/site-packages/tn/en_tn_tagger.fst
63
+ 2025-04-29 06:37:59,950 INFO [processor.py:96] /opt/conda/lib/python3.11/site-packages/tn/en_tn_verbalizer.fst
64
+ 2025-04-29 06:37:59,950 INFO [processor.py:97] skip building fst for en_normalizer ...
65
+ 2025-04-29 06:38:05,719 INFO [decode.py:708] Number of model parameters: 1314396804
66
+ 2025-04-29 06:38:05,720 INFO [data_module.py:414] About to get test cuts
log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-40-09 ADDED
@@ -0,0 +1,66 @@
1
+ 2025-04-29 06:40:09,268 INFO [decode.py:696] Decoding started
2
+ 2025-04-29 06:40:09,270 INFO [decode.py:697] {
3
+ "audio_key": "question_audio",
4
+ "avg": 1,
5
+ "beam_size": 1,
6
+ "bucketing_sampler": true,
7
+ "drop_last": true,
8
+ "enable_musan": true,
9
+ "enable_spec_aug": true,
10
+ "enable_speech_output": true,
11
+ "encoder_projector_ds_rate": 8,
12
+ "env_info": {
13
+ "IP address": "0.114.200.181",
14
+ "hostname": "7522485",
15
+ "icefall-git-branch": "master",
16
+ "icefall-git-date": "Fri Apr 25 10:31:43 2025",
17
+ "icefall-git-sha1": "d742043e-dirty",
18
+ "icefall-path": "/workspace/slam/icefall_omni",
19
+ "k2-build-type": "Release",
20
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
21
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
22
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
23
+ "k2-version": "1.24.4",
24
+ "k2-with-cuda": true,
25
+ "lhotse-path": "/workspace/slam/lhotse_pr/lhotse/__init__.py",
26
+ "lhotse-version": "1.31.0.dev+git.8c56a3e6.dirty",
27
+ "python-version": "3.11",
28
+ "torch-cuda-available": true,
29
+ "torch-cuda-version": "12.4",
30
+ "torch-version": "2.4.0"
31
+ },
32
+ "epoch": 999,
33
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
34
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
35
+ "input_strategy": "PrecomputedFeatures",
36
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
37
+ "log_dir": "slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun",
38
+ "manifest_dir": "data/fbank",
39
+ "max_duration": 1,
40
+ "method": "e2e-epoch10_speech2speech_rerun",
41
+ "num_buckets": 30,
42
+ "num_workers": 2,
43
+ "on_the_fly_feats": false,
44
+ "remove_whisper_encoder_input_length_restriction": true,
45
+ "resample_to_16kHz": true,
46
+ "return_cuts": true,
47
+ "shuffle": true,
48
+ "spec_aug_time_warp_factor": 80,
49
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
50
+ "suffix": "epoch-999-avg-1",
51
+ "text_key": "answer",
52
+ "token2wav_path": "/workspace/CosyVoice-300M-SFT",
53
+ "use_flash_attn": true,
54
+ "use_lora": true
55
+ }
56
+ 2025-04-29 06:40:09,270 INFO [decode.py:703] device: cuda
57
+ 2025-04-29 06:40:31,296 INFO [flow.py:51] input frame rate=50
58
+ 2025-04-29 06:40:32,568 INFO [processor.py:101] building fst for zh_normalizer ...
59
+ 2025-04-29 06:40:57,320 INFO [processor.py:106] done
60
+ 2025-04-29 06:40:57,320 INFO [processor.py:107] fst path: /opt/conda/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
61
+ 2025-04-29 06:40:57,321 INFO [processor.py:108] /opt/conda/lib/python3.11/site-packages/tn/zh_tn_verbalizer.fst
62
+ 2025-04-29 06:40:57,325 INFO [processor.py:95] found existing fst: /opt/conda/lib/python3.11/site-packages/tn/en_tn_tagger.fst
63
+ 2025-04-29 06:40:57,325 INFO [processor.py:96] /opt/conda/lib/python3.11/site-packages/tn/en_tn_verbalizer.fst
64
+ 2025-04-29 06:40:57,325 INFO [processor.py:97] skip building fst for en_normalizer ...
65
+ 2025-04-29 06:40:59,720 INFO [decode.py:709] Number of model parameters: 1314396804
66
+ 2025-04-29 06:40:59,721 INFO [data_module.py:414] About to get test cuts
log-e2e-epoch10_speech2speech_rerun/log-decode-epoch-999-avg-1-2025-04-29-06-41-35 ADDED
@@ -0,0 +1,75 @@
1
+ 2025-04-29 06:41:35,728 INFO [decode.py:697] Decoding started
2
+ 2025-04-29 06:41:35,729 INFO [decode.py:698] {
3
+ "audio_key": "question_audio",
4
+ "avg": 1,
5
+ "beam_size": 1,
6
+ "bucketing_sampler": true,
7
+ "drop_last": true,
8
+ "enable_musan": true,
9
+ "enable_spec_aug": true,
10
+ "enable_speech_output": true,
11
+ "encoder_projector_ds_rate": 8,
12
+ "env_info": {
13
+ "IP address": "0.114.200.181",
14
+ "hostname": "7522485",
15
+ "icefall-git-branch": "master",
16
+ "icefall-git-date": "Fri Apr 25 10:31:43 2025",
17
+ "icefall-git-sha1": "d742043e-dirty",
18
+ "icefall-path": "/workspace/slam/icefall_omni",
19
+ "k2-build-type": "Release",
20
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
21
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
22
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
23
+ "k2-version": "1.24.4",
24
+ "k2-with-cuda": true,
25
+ "lhotse-path": "/workspace/slam/lhotse_pr/lhotse/__init__.py",
26
+ "lhotse-version": "1.31.0.dev+git.8c56a3e6.dirty",
27
+ "python-version": "3.11",
28
+ "torch-cuda-available": true,
29
+ "torch-cuda-version": "12.4",
30
+ "torch-version": "2.4.0"
31
+ },
32
+ "epoch": 999,
33
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
34
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
35
+ "input_strategy": "PrecomputedFeatures",
36
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
37
+ "log_dir": "slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun",
38
+ "manifest_dir": "data/fbank",
39
+ "max_duration": 1,
40
+ "method": "e2e-epoch10_speech2speech_rerun",
41
+ "num_buckets": 30,
42
+ "num_workers": 2,
43
+ "on_the_fly_feats": false,
44
+ "remove_whisper_encoder_input_length_restriction": true,
45
+ "resample_to_16kHz": true,
46
+ "return_cuts": true,
47
+ "shuffle": true,
48
+ "spec_aug_time_warp_factor": 80,
49
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
50
+ "suffix": "epoch-999-avg-1",
51
+ "text_key": "answer",
52
+ "token2wav_path": "/workspace/CosyVoice-300M-SFT",
53
+ "use_flash_attn": true,
54
+ "use_lora": true
55
+ }
56
+ 2025-04-29 06:41:35,729 INFO [decode.py:704] device: cuda
57
+ 2025-04-29 06:41:58,540 INFO [flow.py:51] input frame rate=50
58
+ 2025-04-29 06:41:59,853 INFO [processor.py:101] building fst for zh_normalizer ...
59
+ 2025-04-29 06:42:24,447 INFO [processor.py:106] done
60
+ 2025-04-29 06:42:24,447 INFO [processor.py:107] fst path: /opt/conda/lib/python3.11/site-packages/tn/zh_tn_tagger.fst
61
+ 2025-04-29 06:42:24,447 INFO [processor.py:108] /opt/conda/lib/python3.11/site-packages/tn/zh_tn_verbalizer.fst
62
+ 2025-04-29 06:42:24,452 INFO [processor.py:95] found existing fst: /opt/conda/lib/python3.11/site-packages/tn/en_tn_tagger.fst
63
+ 2025-04-29 06:42:24,452 INFO [processor.py:96] /opt/conda/lib/python3.11/site-packages/tn/en_tn_verbalizer.fst
64
+ 2025-04-29 06:42:24,452 INFO [processor.py:97] skip building fst for en_normalizer ...
65
+ 2025-04-29 06:42:26,967 INFO [decode.py:710] Number of model parameters: 1314396804
66
+ 2025-04-29 06:42:26,967 INFO [data_module.py:414] About to get test cuts
67
+ 2025-04-29 06:42:35,359 INFO [decode.py:627] batch 0/?, cuts processed until now is 1
68
+ 2025-04-29 06:44:35,705 INFO [decode.py:646] The transcripts are stored in slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun/recogs-test-beam-search-epoch-999-avg-1.txt
69
+ 2025-04-29 06:44:35,709 INFO [utils.py:670] [test-beam-search] %WER 229.61% [884 / 385, 643 ins, 78 del, 163 sub ]
70
+ 2025-04-29 06:44:35,719 INFO [decode.py:664] Wrote detailed error stats to slam_omni/exp_speech2speech_rerun/log-e2e-epoch10_speech2speech_rerun/errs-test-beam-search-epoch-999-avg-1.txt
71
+ 2025-04-29 06:44:35,722 INFO [decode.py:678]
72
+ For test, CER of different settings are:
73
+ beam-search 229.61 best for test
74
+
75
+ 2025-04-29 06:44:35,723 INFO [decode.py:754] Done!
log-e2e-epoch10_speech2speech_rerun/recogs-test-beam-search-epoch-999-avg-1.txt ADDED
@@ -0,0 +1,26 @@
1
+ 2a8211e0e75e000bbae1134b21fdbfa4: ref=['以下是一道小学数学题:', '有一条直线,上面有', '6', '只蚂蚁,其中', '4', '只向左走,2', '只向右走。问:向右走的蚂蚁数量占比是多少?', 'A.', '25%', 'B.', '33.3%', 'C.', '50%', 'D.', '66.6%', '回答该问题。']
2
+ 2a8211e0e75e000bbae1134b21fdbfa4: hyp=答案是D. 60%
3
+ 向右走的蚂蚁数量为2只,总共6只蚂蚁中,有2只是向右走的,因此60%的6只蚂蚁中有2只。
4
+ 所以,答案选D。<|im_end|>
5
+ 2e9fddd659b1167aed93534c608f1d99: ref=['A公司去年亏损了500万美元,今年净利润增长了50%,今年的净利润是多少?']
6
+ 2e9fddd659b1167aed93534c608f1d99: hyp=250万美元。<|im_end|>
7
+ 31b4015dd1258ff1f8be44d6504e6956: ref=['请给我介绍一下人工智能的技术和发展。']
8
+ 31b4015dd1258ff1f8be44d6504e6956: hyp=人工智能是一种模拟人类智能的技术,它包括机器学习、自然语言处理、计算机视觉等多个方面。目前,人工智能技术已经广泛应用于各个领域,比如医疗保健、金融、交通等。其中,深度学习是当前最热门的技术之一,它可以实现对大量数据的自动分析和学习,并且可以用于图像识别、语音识别等方面。未来,随着科技的发展,人工智能将会更加普及,为我们的生活带来更多的便利和创新。<|im_end|>
9
+ 3243962376f8c2e411e587e6482f9367: ref=['写一首关于自然美景的诗歌。']
10
+ 3243962376f8c2e411e587e6482f9367: hyp=在大自然的怀抱中,我感受到了生命的美好。山川河流、花草树木,都散发着独特的魅力。阳光明媚,微风拂面,让人感受到自然的力量和美丽。在这片广袤的大地上,我看到了无尽的可能和希望,也感受到了内心的宁静和平静。<|im_end|>
11
+ 450e7541d093ee0e7e12ab76a0dff546: ref=['帮我想一些关于狗的名字。']
12
+ 450e7541d093ee0e7e12ab76a0dff546: hyp=好的,以下是一些关于狗的名字:旺财、小黑、巧克力、大黄、金毛、拉布拉多、吉娃娃、柯基、贵宾、斗牛犬。当然,如果您有其他要求或想法,我也可以根据您的需求进行更精准的推荐。<|im_end|>
13
+ 88562bce265cff88246933e62b756e84: ref=['给定一段文本和关键词列表,删除文本中包含所有给定关键词的子字符串。', '文本:"这是一个测试句子,目的是看看模型是否可以正确地从这个句子中删除关键词。"\\n关键词列表:[‘测试’,‘模型’]']
14
+ 88562bce265cff88246933e62b756e84: hyp=删除后的文本为:"这是一个测试句子,目的是看看模型是否正确地从这个句子中删除关键词"。<|im_end|>
15
+ ca7f2b2ab703a4f37b5da09fd119a28a: ref=['根据输入的文本,判断它是否属于新闻报道、广告或博客文章类别之一。', '新闻报道:“奥运会在东京隆重开幕,中国代表团获得多项奖牌。”']
16
+ ca7f2b2ab703a4f37b5da09fd119a28a: hyp=根据输入的文本,可以判断它属于新闻报道类别。原因如下:
17
+ 1. 文本主题:该段文字描述的是奥运会的消息,这通常是新闻报道的主题之一。
18
+ 2. 文本内容:文本中提到了多个事件,包括中国代表团获得多项奖牌,这是典型的新闻报道内容。
19
+ 3. 文本语气:整个文本的语气客观、中立,没有明显的广告或博客文章的推销和宣传元素。
20
+ 综上所述,可以得出结论,这段文本是一篇新闻报道。<|im_end|>
21
+ d60daeaebb5b0b14ee34f5d714075108: ref=['给我推荐一些旅游景点。']
22
+ d60daeaebb5b0b14ee34f5d714075108: hyp=如果你喜欢自然风光,我推荐你去云南的丽江、大理和西双版纳。这些地方有壮丽的山脉、清澈的湖泊和独特的文化。如果你想体验异国风情,可以前往泰国的曼谷、印度尼西亚的巴厘岛或日本的东京。此外,欧洲有许多美丽的城市,如巴黎、威尼斯和阿姆斯特丹等,也是不错的选择。<|im_end|>
23
+ ee07a1222f1d62cf0358f51d12f4c67c: ref=['很好,现在请你计算一下这篇文章中出现了多少个“获得”字样。']
24
+ ee07a1222f1d62cf0358f51d12f4c67c: hyp=好的,让我来计算一下。在这篇文章中,“获得”出现了两次。<|im_end|>
25
+ f5cf13be42666535036bdf0f05fcc0e3: ref=['好的。现在请你将这个文本中的所有的逗号都替换成空格。']
26
+ f5cf13be42666535036bdf0f05fcc0e3: hyp=好的,替换后的文本为:"这是一个句子 这是目的看看是否可以正确地从这个句子中删除关键词"。<|im_end|>
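Note that these recogs are scored per character, so every ASCII symbol of a stray "<|im_end|>" tag in a hypothesis counts as its own insertion. The repo's scorer (icefall's utils) computes a true minimum-edit-distance alignment; purely as a rough illustration, difflib can produce a similar opcode breakdown. The ref/hyp pair below is the short 450e... example from above, with the hypothesis truncated for brevity:

import difflib

ref = list("帮我想一些关于狗的名字。")
hyp = list("好的,以下是一些关于狗的名字。")

ins = dels = subs = 0
for op, i1, i2, j1, j2 in difflib.SequenceMatcher(a=ref, b=hyp).get_opcodes():
    if op == "replace":  # overlapping span: count paired subs, rest as ins/del
        subs += min(i2 - i1, j2 - j1)
        ins += max(0, (j2 - j1) - (i2 - i1))
        dels += max(0, (i2 - i1) - (j2 - j1))
    elif op == "insert":
        ins += j2 - j1
    elif op == "delete":
        dels += i2 - i1

print(f"ins={ins} del={dels} sub={subs} over {len(ref)} ref chars")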
log/log-train-2025-04-24-08-58-17 ADDED
@@ -0,0 +1,689 @@
1
+ 2025-04-24 08:58:17,434 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.172.122",
21
+ "hostname": "7515258",
22
+ "icefall-git-branch": "master",
23
+ "icefall-git-date": "Thu Apr 24 08:24:11 2025",
24
+ "icefall-git-sha1": "2e9be467-dirty",
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 40,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
71
+ 2025-04-24 08:58:17,434 INFO [train.py:655] About to create model
72
+ 2025-04-24 08:58:43,456 INFO [train.py:808] Number of model parameters: 1326848644
73
+ 2025-04-24 08:58:43,457 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 08:58:43,457 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
88
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 08:58:43,458 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 08:58:43,459 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 08:58:43,460 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,461 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,462 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,463 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,464 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,465 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,466 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,467 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,468 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,469 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,470 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,471 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,472 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,473 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,474 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,475 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,476 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,477 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,478 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,479 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,480 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,481 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,482 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
575
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
576
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
577
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
578
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
579
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
580
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
581
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
582
+ 2025-04-24 08:58:43,483 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
583
+ 2025-04-24 08:58:43,483 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
584
+ 2025-04-24 08:58:43,483 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
585
+ 2025-04-24 08:58:43,484 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
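The shapes logged above follow the standard LoRA convention: for a projection with input width d_in and output width d_out, lora_A is [r, d_in] and lora_B is [d_out, r] with rank r = 64, so the adapter update is B @ A. Likewise, speech_token_projector ([1024, 896]) is consistent with mapping the 896-dim Qwen2.5-0.5B hidden states into the 1024-dim codec LM, and codec_lm_head scores a 4100-entry codec vocabulary. The sketch below is a minimal illustration of how such a rank-64 pair composes with a frozen weight; the alpha scaling value is an assumption, not something read from train.py.

```python
# Minimal sketch of a rank-64 LoRA update on a 1024-dim codec LM projection.
# Shapes mirror the log lines above; `alpha` is an assumed hyperparameter.
import torch

d_in, d_out, r, alpha = 1024, 1024, 64, 16
W = torch.randn(d_out, d_in)           # frozen base weight (q_proj, k_proj, ...)
lora_A = torch.randn(r, d_in) * 0.01   # torch.Size([64, 1024]) in the log; small random init
lora_B = torch.zeros(d_out, r)         # torch.Size([1024, 64]); B = 0, so training starts at the frozen model

x = torch.randn(2, d_in)
y = x @ (W + (alpha / r) * (lora_B @ lora_A)).T   # effective weight = W + scale * B @ A
```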
586
+ 2025-04-24 08:58:43,484 INFO [train.py:819] Device: cuda:1
587
+ 2025-04-24 08:58:44,314 INFO [train.py:823] Using DeepSpeed
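The DeepSpeed configuration referenced in these logs ("./slam_omni/ds_config_zero1.json") is not part of this upload. A minimal ZeRO stage-1 configuration consistent with the fp16 settings in the dumped hyperparameters might look like the following sketch; the batch-size and accumulation values are assumptions, not taken from this repo.

```python
# Hypothetical sketch of a ZeRO stage-1 DeepSpeed config such as
# slam_omni/ds_config_zero1.json; the actual file is not included here.
import json

ds_config = {
    "train_micro_batch_size_per_gpu": 1,  # assumption; lhotse batches by max_duration, not a fixed size
    "gradient_accumulation_steps": 1,     # assumption
    "fp16": {"enabled": True},            # matches "use_fp16": true in the hyperparameter dump
    "zero_optimization": {"stage": 1},    # ZeRO-1: shard optimizer states only
}

with open("ds_config_zero1.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```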
588
+ 2025-04-24 08:58:46,121 INFO [data_module.py:445] About to get train cuts
589
+ 2025-04-24 08:58:46,124 INFO [data_module.py:250] Disable MUSAN
590
+ 2025-04-24 08:58:46,124 INFO [data_module.py:268] Enable SpecAugment
591
+ 2025-04-24 08:58:46,124 INFO [data_module.py:269] Time warp factor: 80
592
+ 2025-04-24 08:58:46,124 INFO [data_module.py:279] Num frame mask: 10
593
+ 2025-04-24 08:58:46,124 INFO [data_module.py:292] About to create train dataset
594
+ 2025-04-24 08:58:46,124 INFO [data_module.py:319] Using DynamicBucketingSampler.
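As a rough sketch, the sampler named in the line above is typically constructed in lhotse as shown below, using the max_duration=40 and num_buckets=30 values from the hyperparameter dump later in these logs; the exact call inside data_module.py may differ.

```python
# Illustrative construction of the DynamicBucketingSampler named above;
# argument values come from the logged hyperparameters, not data_module.py.
from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler

def make_sampler(cuts: CutSet) -> DynamicBucketingSampler:
    return DynamicBucketingSampler(
        cuts,
        max_duration=40,   # total seconds of audio per batch
        num_buckets=30,    # group cuts of similar duration to reduce padding
        shuffle=True,
        drop_last=True,
    )
```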
595
+ 2025-04-24 08:58:46,156 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
596
+ 2025-04-24 08:58:46,338 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
597
+ 2025-04-24 08:58:46,831 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
598
+ 2025-04-24 08:58:46,985 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
599
+ 2025-04-24 08:58:47,053 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
600
+ 2025-04-24 08:58:47,120 INFO [data_module.py:336] About to create train dataloader
601
+ 2025-04-24 08:58:47,120 INFO [data_module.py:436] About to get test cuts
602
+ 2025-04-24 08:58:47,121 INFO [data_module.py:365] About to create dev dataset
603
+ 2025-04-24 08:58:47,209 INFO [data_module.py:379] About to create dev dataloader
604
+ 2025-04-24 08:58:47,215 INFO [train.py:875] start training from epoch 1
605
+ 2025-04-24 08:58:47,376 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
606
+ 2025-04-24 08:58:47,606 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
607
+ 2025-04-24 08:58:48,237 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
608
+ 2025-04-24 08:58:48,432 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
609
+ 2025-04-24 08:58:48,511 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
610
+ 2025-04-24 08:58:49,064 WARNING [train.py:847] Exclude cut with ID de225db1da1393474e4ce9af8cbafb42 from training. Duration: 5.433469387755102, lenth: 2230
611
+ 2025-04-24 08:58:49,354 WARNING [train.py:847] Exclude cut with ID ef376c7d1fb4606edb0547b2a7fb95b3 from training. Duration: 2.7399546485260773, lenth: 2637
612
+ 2025-04-24 08:58:49,692 WARNING [train.py:847] Exclude cut with ID c7f3c7da44c2c6e27f86bda421b958d8 from training. Duration: 2.995374149659864, lenth: 2318
613
+ 2025-04-24 08:58:49,867 WARNING [train.py:847] Exclude cut with ID 8d724f452d1d99549eebf90ab658e7bd from training. Duration: 5.712108843537415, lenth: 2209
614
+ 2025-04-24 08:58:49,983 WARNING [train.py:847] Exclude cut with ID a585785e795cb35d02d8f7b599352e16 from training. Duration: 4.911020408163266, lenth: 2575
615
+ 2025-04-24 08:58:50,062 WARNING [train.py:847] Exclude cut with ID 31a00c5e63c88ef67402f578ebc6ebf7 from training. Duration: 3.575873015873016, lenth: 2307
616
+ 2025-04-24 08:58:50,180 WARNING [train.py:847] Exclude cut with ID 32ab4761aaf08d9473fc8388c3a6cffc from training. Duration: 7.9760544217687075, lenth: 2201
617
+ 2025-04-24 08:58:50,237 WARNING [train.py:847] Exclude cut with ID 5fddb6bf9d5265c0409b81c85fadbaec from training. Duration: 3.4597732426303853, lenth: 2210
618
+ 2025-04-24 08:58:51,088 WARNING [train.py:847] Exclude cut with ID c5d43a2443ce7820cf864f7c55eafb77 from training. Duration: 3.912562358276644, lenth: 2315
619
+ 2025-04-24 08:58:52,211 WARNING [train.py:847] Exclude cut with ID a951e36267c8901d4c03fb6b4dffe636 from training. Duration: 4.73687074829932, lenth: 2223
620
+ 2025-04-24 08:58:52,283 WARNING [train.py:847] Exclude cut with ID 595680451c1ee7e42dba0452e165dea3 from training. Duration: 3.959002267573696, lenth: 2251
621
+ 2025-04-24 08:58:52,830 WARNING [train.py:847] Exclude cut with ID 20fa9b11557159d78f72ccc6b464de18 from training. Duration: 3.9938321995464854, lenth: 2215
622
+ 2025-04-24 08:58:52,884 WARNING [train.py:847] Exclude cut with ID 8a01fc72b4108673fd8393e523450e23 from training. Duration: 8.719092970521542, lenth: 2517
623
+ 2025-04-24 08:58:53,397 WARNING [train.py:847] Exclude cut with ID 924d54288847d448a20809a26b320477 from training. Duration: 3.81968253968254, lenth: 2302
624
+ 2025-04-24 08:58:53,695 WARNING [train.py:847] Exclude cut with ID a7f91a17f6318d896d0bee7b7307d000 from training. Duration: 5.294149659863946, lenth: 2239
625
+ 2025-04-24 08:58:54,189 WARNING [train.py:847] Exclude cut with ID 8d7cff5d1d25a773f9d27590b5912b60 from training. Duration: 4.597551020408163, lenth: 2203
626
+ 2025-04-24 08:58:54,309 WARNING [train.py:847] Exclude cut with ID 13407ca1ec78def38346569b54d11a78 from training. Duration: 5.909478458049887, lenth: 2280
627
+ 2025-04-24 08:58:54,313 WARNING [train.py:847] Exclude cut with ID 6390ab6964b52f2ee9bf36826393abaa from training. Duration: 4.017052154195011, lenth: 2209
628
+ 2025-04-24 08:58:54,357 WARNING [train.py:847] Exclude cut with ID f55c19435d6643ebf7356a1072a014fa from training. Duration: 5.259319727891157, lenth: 2310
629
+ 2025-04-24 08:58:55,044 WARNING [train.py:847] Exclude cut with ID 546f5fb6d11db41d104ee7b0667f4e1d from training. Duration: 5.712108843537415, lenth: 2770
630
+ 2025-04-24 08:58:55,337 WARNING [train.py:847] Exclude cut with ID b2069846ab4df18f5caa676751aa0b40 from training. Duration: 6.6525170068027215, lenth: 2588
631
+ 2025-04-24 08:58:55,708 WARNING [train.py:847] Exclude cut with ID 5717a65b9855425c0330e6022a71cc2f from training. Duration: 5.073560090702948, lenth: 2269
632
+ 2025-04-24 08:58:56,015 WARNING [train.py:847] Exclude cut with ID d6e37a975b3c69a86a808347b53e42f2 from training. Duration: 5.259319727891157, lenth: 2885
633
+ 2025-04-24 08:58:56,164 WARNING [train.py:847] Exclude cut with ID d15e75257f8dac616c078113f8f96c63 from training. Duration: 7.6741950113378685, lenth: 2276
634
+ 2025-04-24 08:58:56,227 WARNING [train.py:847] Exclude cut with ID 7ea49b35e0dc536537ae3ddcd07032af from training. Duration: 4.829750566893424, lenth: 2206
635
+ 2025-04-24 08:58:56,689 WARNING [train.py:847] Exclude cut with ID 0a1a80c13fd0c38d97c9491dff84c7da from training. Duration: 2.914104308390023, lenth: 2220
636
+ 2025-04-24 08:58:56,739 WARNING [train.py:847] Exclude cut with ID 359ff41c694bb848c3e37b948b1434f7 from training. Duration: 6.478367346938776, lenth: 2881
637
+ 2025-04-24 08:58:56,747 WARNING [train.py:847] Exclude cut with ID 6989881203588034ddf38ae5a7cfe238 from training. Duration: 5.491519274376417, lenth: 2243
638
+ 2025-04-24 08:58:57,027 WARNING [train.py:847] Exclude cut with ID fd7b5cb9a6bce2c02bf8a7c3dd2b35d6 from training. Duration: 5.793378684807256, lenth: 2305
639
+ 2025-04-24 08:58:58,007 WARNING [train.py:847] Exclude cut with ID e1c5efd6d4cd7c6dc0f3ac7720161c49 from training. Duration: 9.438911564625851, lenth: 2201
640
+ 2025-04-24 08:58:58,405 WARNING [train.py:847] Exclude cut with ID c13ffcc7b60ea460731e417d2d9575cf from training. Duration: 3.250793650793651, lenth: 2205
641
+ 2025-04-24 08:58:58,498 WARNING [train.py:847] Exclude cut with ID ba1a37e64efe00d8b8cbfb0069e65f8d from training. Duration: 3.274013605442177, lenth: 2290
642
+ 2025-04-24 08:58:58,579 WARNING [train.py:847] Exclude cut with ID de9f5fe02034bc4b0a66fbe378fffb1a from training. Duration: 6.455147392290249, lenth: 2228
643
+ 2025-04-24 08:58:59,067 WARNING [train.py:847] Exclude cut with ID 8ca7a35870994edd410e1ea8e951faa3 from training. Duration: 7.012426303854875, lenth: 2233
644
+ 2025-04-24 08:58:59,214 WARNING [train.py:847] Exclude cut with ID 3260368505eddf46fb1c9b562124ff18 from training. Duration: 12.097596371882085, lenth: 2261
645
+ 2025-04-24 08:58:59,412 WARNING [train.py:847] Exclude cut with ID 36878ea551121147626747e9fc39ae17 from training. Duration: 5.537959183673469, lenth: 2224
646
+ 2025-04-24 08:59:00,361 WARNING [train.py:847] Exclude cut with ID d4c7270e6c9b3dffed8643040b5f922b from training. Duration: 4.133151927437642, lenth: 2329
647
+ 2025-04-24 08:59:00,843 WARNING [train.py:847] Exclude cut with ID 179517464c97fbbcb8909a56b849bf73 from training. Duration: 6.617687074829932, lenth: 2257
648
+ 2025-04-24 08:59:01,065 WARNING [train.py:847] Exclude cut with ID 17480e9524e4c0f8e405487e50c8e98e from training. Duration: 4.551111111111111, lenth: 2263
649
+ 2025-04-24 08:59:01,165 WARNING [train.py:847] Exclude cut with ID 2c10af552b911b8ac45c1fc7bb9fcb5d from training. Duration: 6.2693877551020405, lenth: 2481
650
+ 2025-04-24 08:59:01,276 WARNING [train.py:847] Exclude cut with ID aa2539588175d8c96975a06a8993e5b6 from training. Duration: 5.456689342403628, lenth: 2573
651
+ 2025-04-24 08:59:01,317 WARNING [train.py:847] Exclude cut with ID 32945d8e21a8628bb3c93fd499330639 from training. Duration: 13.455963718820861, lenth: 2205
652
+ 2025-04-24 08:59:01,495 WARNING [train.py:847] Exclude cut with ID 64db5d6166c3edc340d8e1b02c059f19 from training. Duration: 4.818140589569161, lenth: 2259
653
+ 2025-04-24 08:59:01,638 WARNING [train.py:847] Exclude cut with ID 39483151e08291efe16217ab41622d59 from training. Duration: 7.952834467120182, lenth: 2439
654
+ 2025-04-24 08:59:02,375 WARNING [train.py:847] Exclude cut with ID c4a74dd23aa9a3fb13330113787139d6 from training. Duration: 3.239183673469388, lenth: 2276
655
+ 2025-04-24 08:59:02,420 WARNING [train.py:847] Exclude cut with ID b12a8836f1aa525a763ac657a44229a3 from training. Duration: 5.0967800453514736, lenth: 2427
656
+ 2025-04-24 08:59:05,911 WARNING [train.py:847] Exclude cut with ID 185cfa3ae9c12babe1e217de612f0c7b from training. Duration: 6.257777777777778, lenth: 2587
657
+ 2025-04-24 08:59:06,001 WARNING [train.py:847] Exclude cut with ID 08eff5b64d191a0d50634630437a3107 from training. Duration: 10.135510204081633, lenth: 2273
658
+ 2025-04-24 08:59:06,092 WARNING [train.py:847] Exclude cut with ID bcd8207538deeccfcf8d1de872b8f7e9 from training. Duration: 4.179591836734694, lenth: 2242
659
+ 2025-04-24 08:59:06,302 WARNING [train.py:847] Exclude cut with ID f909651c923fdd6cc0eaa81de64f4a5b from training. Duration: 8.068934240362811, lenth: 2421
660
+ 2025-04-24 08:59:06,330 WARNING [train.py:847] Exclude cut with ID f5d34097b89bd201018189ff44b10b33 from training. Duration: 3.7964625850340137, lenth: 2250
661
+ 2025-04-24 08:59:06,425 WARNING [train.py:847] Exclude cut with ID 245530abe5a62b390611b7a3b91ddfef from training. Duration: 9.079002267573696, lenth: 2299
662
+ 2025-04-24 08:59:07,344 WARNING [train.py:847] Exclude cut with ID 79d0d3129c764fd4d9c5391b6353e349 from training. Duration: 5.0967800453514736, lenth: 2309
663
+ 2025-04-24 08:59:07,382 WARNING [train.py:847] Exclude cut with ID 1969177f73511ef204642f43ce029d05 from training. Duration: 3.657142857142857, lenth: 2339
664
+ 2025-04-24 08:59:07,895 WARNING [train.py:847] Exclude cut with ID f601e756ecd6074cd3037c61e413cb6e from training. Duration: 5.712108843537415, lenth: 2287
665
+ 2025-04-24 08:59:08,149 WARNING [train.py:847] Exclude cut with ID 9f7758e035b3eebcb9267aa49ad65d4c from training. Duration: 3.9938321995464854, lenth: 2303
666
+ 2025-04-24 08:59:08,619 WARNING [train.py:847] Exclude cut with ID b29a42e8c0af813bf605d7f0a974620c from training. Duration: 5.050340136054421, lenth: 2219
667
+ 2025-04-24 08:59:08,791 WARNING [train.py:847] Exclude cut with ID 192e5517b050a4659892a2832d986764 from training. Duration: 5.12, lenth: 2420
668
+ 2025-04-24 08:59:08,959 WARNING [train.py:847] Exclude cut with ID bc262e923d16a434a3c5701bea841e4e from training. Duration: 7.418775510204082, lenth: 2205
669
+ 2025-04-24 08:59:09,126 WARNING [train.py:847] Exclude cut with ID 5141e01019dec47ed309fbf5881ab675 from training. Duration: 8.034104308390022, lenth: 2284
670
+ 2025-04-24 08:59:09,156 WARNING [train.py:847] Exclude cut with ID 3555e5c7123d744b39ba7ecce44ce1e2 from training. Duration: 3.7151927437641725, lenth: 2226
671
+ 2025-04-24 08:59:09,605 WARNING [train.py:847] Exclude cut with ID 973a1dc43ca452580d6999027a82fbec from training. Duration: 5.270929705215419, lenth: 2250
672
+ 2025-04-24 08:59:09,718 WARNING [train.py:847] Exclude cut with ID e2791accb62bb5e4bde9e0e2b04945c5 from training. Duration: 3.517823129251701, lenth: 2286
673
+ 2025-04-24 08:59:09,897 WARNING [train.py:847] Exclude cut with ID e91152d010c4479932b7ab64a998d5c9 from training. Duration: 4.771700680272109, lenth: 2468
674
+ 2025-04-24 08:59:10,032 WARNING [train.py:847] Exclude cut with ID cff6a8011a86580fcf47595d18819ae9 from training. Duration: 4.458231292517007, lenth: 2205
675
+ 2025-04-24 08:59:10,081 WARNING [train.py:847] Exclude cut with ID 1623cdac3d17f7a211e8a8f434f70657 from training. Duration: 4.191201814058957, lenth: 2308
676
+ 2025-04-24 08:59:10,723 WARNING [train.py:847] Exclude cut with ID 273cfc04aebaf8b0cad541d0f3d1cded from training. Duration: 6.408707482993197, lenth: 2609
677
+ 2025-04-24 08:59:10,804 WARNING [train.py:847] Exclude cut with ID 7d591b090a42935fd1edd8de97c690b6 from training. Duration: 5.038730158730159, lenth: 2229
678
+ 2025-04-24 08:59:10,900 WARNING [train.py:847] Exclude cut with ID 1cfa9ef743b568bb034b415000c8c479 from training. Duration: 7.174965986394557, lenth: 2440
679
+ 2025-04-24 08:59:11,699 WARNING [train.py:847] Exclude cut with ID 762676d460719a9203fa7e7537560530 from training. Duration: 3.1346938775510202, lenth: 2443
680
+ 2025-04-24 08:59:12,205 WARNING [train.py:847] Exclude cut with ID 43575814ad7bb6d631483cdc513b49ac from training. Duration: 10.95981859410431, lenth: 2380
681
+ 2025-04-24 08:59:12,326 WARNING [train.py:847] Exclude cut with ID ae208dee7119ca436f380b99077bcde9 from training. Duration: 5.2128798185941045, lenth: 2615
682
+ 2025-04-24 08:59:12,425 WARNING [train.py:847] Exclude cut with ID b56757708077ab00baa7857c1cef8491 from training. Duration: 5.456689342403628, lenth: 2206
683
+ 2025-04-24 08:59:12,660 WARNING [train.py:847] Exclude cut with ID f3f6d5671f3d11bc83f8fcad598007ff from training. Duration: 4.179591836734694, lenth: 2262
684
+ 2025-04-24 08:59:13,091 WARNING [train.py:847] Exclude cut with ID 6af07f098cd794b9dd5a6bd660f53505 from training. Duration: 4.435011337868481, lenth: 2251
685
+ 2025-04-24 08:59:13,234 WARNING [train.py:847] Exclude cut with ID 4ac1131393136a35ef54499bf3c728d4 from training. Duration: 7.732244897959184, lenth: 2211
686
+ 2025-04-24 08:59:13,236 WARNING [train.py:847] Exclude cut with ID 482877e78defa83a355d2cdeb5a629ed from training. Duration: 5.154829931972789, lenth: 2891
687
+ 2025-04-24 08:59:16,145 WARNING [train.py:847] Exclude cut with ID a93637e8f3e7bdf34bc8eb5d34d34197 from training. Duration: 3.215963718820862, lenth: 2235
688
+ 2025-04-24 08:59:17,828 WARNING [train.py:847] Exclude cut with ID 90c6fbf20eeb34c13ffb97a198d84b0b from training. Duration: 5.0967800453514736, lenth: 2236
689
+ 2025-04-24 08:59:17,931 INFO [train.py:539] Computing validation loss
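The repeated "Exclude cut" warnings above appear to apply a token-length cap: every excluded cut logs a "lenth" above 2200 regardless of its audio duration. A minimal reconstruction of such a predicate follows; the 2200 threshold and the attribute names are inferred from the logged values, not read from train.py.

```python
# Hypothetical reconstruction of the filter behind the "Exclude cut" warnings.
# Threshold and field names are inferred from the log, not taken from train.py.
import logging

MAX_TOKEN_LEN = 2200  # smallest excluded "lenth" above is 2201

def keep_cut(cut, token_len: int) -> bool:
    if token_len > MAX_TOKEN_LEN:
        logging.warning(
            "Exclude cut with ID %s from training. Duration: %s, lenth: %s",
            cut.id, cut.duration, token_len,
        )
        return False
    return True
```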
log/log-train-2025-04-24-09-09-22 ADDED
@@ -0,0 +1,689 @@
1
+ 2025-04-24 09:09:22,425 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.172.122",
21
+ "hostname": "7515258",
22
+ "icefall-git-branch": "master",
23
+ "icefall-git-date": "Thu Apr 24 08:24:11 2025",
24
+ "icefall-git-sha1": "2e9be467-dirty",
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 40,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
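Among these hyperparameters, "encoder_projector_ds_rate": 8 controls how aggressively the Whisper encoder output is downsampled before entering the LLM. A common bridge for this is to concatenate every 8 adjacent frames and project them with a linear layer; the sketch below assumes that pattern (with whisper-large-v2's 1280-dim encoder and Qwen2.5-0.5B's 896-dim hidden size) and is not necessarily the implementation in this repository.

```python
# Sketch of a downsampling encoder projector with ds_rate = 8; a common
# speech-LLM bridge pattern, assumed here rather than lifted from this repo.
import torch
import torch.nn as nn

class EncoderProjector(nn.Module):
    def __init__(self, encoder_dim=1280, llm_dim=896, ds_rate=8):
        super().__init__()
        self.ds_rate = ds_rate
        self.proj = nn.Linear(encoder_dim * ds_rate, llm_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, d = x.shape
        t = t - t % self.ds_rate                  # drop frames that don't fill a group of 8
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.proj(x)                       # (b, t/8, 896) for Qwen2.5-0.5B
```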
71
+ 2025-04-24 09:09:22,426 INFO [train.py:655] About to create model
72
+ 2025-04-24 09:09:42,388 INFO [train.py:808] Number of model parameters: 1326848644
73
+ 2025-04-24 09:09:42,389 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 09:09:42,389 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
88
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 09:09:42,390 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 09:09:42,391 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 09:09:42,393 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 09:09:42,394 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-24 09:09:42,395 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-24 09:09:42,396 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-24 09:09:42,397 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
191
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
192
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
193
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
194
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
195
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
196
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
197
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
198
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
199
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
200
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
201
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
202
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
203
+ 2025-04-24 09:09:42,398 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
204
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
205
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
206
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
207
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
208
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
209
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
210
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
211
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
212
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
213
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
214
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
215
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
216
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
217
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
218
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
219
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
220
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
221
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
222
+ 2025-04-24 09:09:42,399 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
223
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
224
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
225
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
226
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
227
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
228
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
229
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
230
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
231
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
232
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
233
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
234
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
235
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
236
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
237
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
238
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
239
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
240
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
241
+ 2025-04-24 09:09:42,400 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
242
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
243
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
244
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
245
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
246
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
247
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
248
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
249
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
250
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
251
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
252
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
253
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
254
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
255
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
256
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
257
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
258
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
259
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
260
+ 2025-04-24 09:09:42,401 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
261
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
262
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
263
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
264
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
265
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
266
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
267
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
268
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
269
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
270
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
271
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
272
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
273
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
274
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
275
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
276
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
277
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
278
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
279
+ 2025-04-24 09:09:42,402 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
280
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
281
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
282
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
283
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
284
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
285
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
286
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
287
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
288
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
289
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
290
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
291
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
292
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
293
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
294
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
295
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
296
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
297
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
298
+ 2025-04-24 09:09:42,403 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
299
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
300
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
301
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
302
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
303
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
304
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
305
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
306
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
307
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
308
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
309
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
310
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
311
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
312
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
313
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
314
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
315
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
316
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
317
+ 2025-04-24 09:09:42,404 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
318
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
319
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
320
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
321
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
322
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
323
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
324
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
325
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
326
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
327
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
328
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
329
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
330
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
331
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
332
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
333
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
334
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
335
+ 2025-04-24 09:09:42,405 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
336
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
337
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
338
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
339
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
340
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
341
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
342
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
343
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
344
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
345
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
346
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
347
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
348
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
349
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
350
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
351
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
352
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
353
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
354
+ 2025-04-24 09:09:42,406 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
355
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
356
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
357
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
358
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
359
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
360
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
361
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
362
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
363
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
364
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
365
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
366
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
367
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
368
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
369
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
370
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
371
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
372
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
373
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
374
+ 2025-04-24 09:09:42,407 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
375
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
376
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
377
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
378
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
379
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
380
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
381
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
382
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
383
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
384
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
385
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
386
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
387
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
388
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
389
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
390
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
391
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
392
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
393
+ 2025-04-24 09:09:42,408 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
394
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
395
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
396
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
397
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
398
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
399
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
400
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
401
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
402
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
403
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
404
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
405
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
406
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
407
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
408
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
409
+ 2025-04-24 09:09:42,409 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
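Note on the `lora_A`/`lora_B` shapes listed above: they follow the usual LoRA factorization. For a frozen weight `W` of shape `[out, in]`, `lora_A` has shape `[r, in]` and `lora_B` has shape `[out, r]`, with rank `r = 64` throughout this log; the 128-dim `k_proj`/`v_proj` outputs are consistent with grouped-query attention in the 896-dim base LLM. Below is a minimal sketch of the adapted forward pass, assuming the standard `lora_alpha / r` scaling (the alpha value itself is not printed in this log):

```python
import torch

r = 64                      # LoRA rank, per the logged shapes
lora_alpha = 16             # placeholder: lora_alpha is not shown in this log
in_dim, out_dim = 896, 896  # e.g. self_attn.q_proj

W = torch.randn(out_dim, in_dim)        # frozen base weight (not trained)
lora_A = torch.randn(r, in_dim) * 0.01  # [64, 896]; PEFT initializes A randomly
lora_B = torch.zeros(out_dim, r)        # [896, 64]; zeros, so the initial update is 0

def lora_linear(x: torch.Tensor) -> torch.Tensor:
    # y = x @ W.T + (alpha / r) * x @ A.T @ B.T; only A and B receive gradients
    return x @ W.T + (lora_alpha / r) * (x @ lora_A.T) @ lora_B.T
```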
410
+ 2025-04-24 09:09:42,409 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
411
+ 2025-04-24 09:09:42,409 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
412
+ 2025-04-24 09:09:42,409 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
413
+ 2025-04-24 09:09:42,410 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
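The `encoder_projector` parameters above bridge the speech encoder output and the 896-dim LLM embedding space with two linear layers (10240 -> 896 -> 896). The 10240-dim input suggests several consecutive encoder frames are stacked before projection; that stacking factor, and the activation between the layers, are assumptions here — the log records only the shapes:

```python
import torch
import torch.nn as nn

class EncoderProjector(nn.Module):
    """Shape-faithful sketch: linear1 weight [896, 10240], linear2 weight [896, 896]."""

    def __init__(self, encoder_dim: int = 10240, llm_dim: int = 896):
        super().__init__()
        self.linear1 = nn.Linear(encoder_dim, llm_dim)
        self.linear2 = nn.Linear(llm_dim, llm_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # ReLU is an assumption; the log does not name the activation.
        return self.linear2(torch.relu(self.linear1(x)))
```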
414
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
415
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
416
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
417
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
418
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
419
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
420
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
421
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
422
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
423
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
424
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
425
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
426
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
427
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
428
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
429
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
430
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
431
+ 2025-04-24 09:09:42,410 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
432
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
433
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
434
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
435
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
436
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
437
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
438
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
439
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
440
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
441
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
442
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
443
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
444
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
445
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
446
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
447
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
448
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
449
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
450
+ 2025-04-24 09:09:42,411 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
451
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
452
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
453
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
454
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
455
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
456
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
457
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
458
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
459
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
460
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
461
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
462
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
463
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
464
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
465
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
466
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
467
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
468
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
469
+ 2025-04-24 09:09:42,412 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
470
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
471
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
472
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
473
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
474
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
475
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
476
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
477
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
478
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
479
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
480
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
481
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
482
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
483
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
484
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
485
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
486
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
487
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
488
+ 2025-04-24 09:09:42,413 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
489
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
490
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
491
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
492
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
493
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
494
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
495
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
496
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
497
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
498
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
499
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
500
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
501
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
502
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
503
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
504
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
505
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
506
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
507
+ 2025-04-24 09:09:42,414 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
508
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
509
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
510
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
511
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
512
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
513
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
514
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
515
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
516
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
517
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
518
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
519
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
520
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
521
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
522
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
523
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
524
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
525
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
526
+ 2025-04-24 09:09:42,415 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
527
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
528
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
529
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
530
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
531
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
532
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
533
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
534
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
535
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
536
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
537
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
538
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
539
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
540
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
541
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
542
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
543
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
544
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
545
+ 2025-04-24 09:09:42,416 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
546
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
547
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
548
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
549
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
550
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
551
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
552
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
553
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
554
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
555
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
556
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
557
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
558
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
559
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
560
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
561
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
562
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
563
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
564
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
565
+ 2025-04-24 09:09:42,417 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
566
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
567
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
568
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
569
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
570
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
571
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
572
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
573
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
574
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
575
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
576
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
577
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
578
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
579
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
580
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
581
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
582
+ 2025-04-24 09:09:42,418 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
583
+ 2025-04-24 09:09:42,418 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
584
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
585
+ 2025-04-24 09:09:42,418 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
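The final trainable tensors connect the 896-dim text LLM to the 1024-dim codec LM: `speech_token_projector` lifts LLM hidden states into the codec LM's input space, and `codec_lm_head` produces logits over 4100 outputs — plausibly a 4096-entry speech-codec codebook plus a few special tokens, though the log does not spell out that split. In shapes:

```python
import torch.nn as nn

llm_dim, codec_dim, codec_vocab = 896, 1024, 4100  # taken from the logged shapes

# weight [1024, 896], bias [1024] -- matches speech_token_projector above
speech_token_projector = nn.Linear(llm_dim, codec_dim)

# weight [4100, 1024], bias [4100] -- matches codec_lm_head above
codec_lm_head = nn.Linear(codec_dim, codec_vocab)
```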
586
+ 2025-04-24 09:09:42,419 INFO [train.py:819] Device: cuda:0
587
+ 2025-04-24 09:09:43,225 INFO [train.py:823] Using DeepSpeed
588
+ 2025-04-24 09:09:45,005 INFO [data_module.py:445] About to get train cuts
589
+ 2025-04-24 09:09:45,007 INFO [data_module.py:250] Disable MUSAN
590
+ 2025-04-24 09:09:45,007 INFO [data_module.py:268] Enable SpecAugment
591
+ 2025-04-24 09:09:45,007 INFO [data_module.py:269] Time warp factor: 80
592
+ 2025-04-24 09:09:45,007 INFO [data_module.py:279] Num frame mask: 10
593
+ 2025-04-24 09:09:45,007 INFO [data_module.py:292] About to create train dataset
594
+ 2025-04-24 09:09:45,008 INFO [data_module.py:319] Using DynamicBucketingSampler.
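The data-pipeline lines above record the augmentation and sampling setup: MUSAN noise mixing disabled, SpecAugment enabled with time-warp factor 80 and 10 frame masks, and lhotse's `DynamicBucketingSampler` for duration-bucketed batching. A sketch of that configuration; the `max_duration` value is a placeholder, since the log does not print it:

```python
from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler, SpecAugment

# SpecAugment as logged: time-warp factor 80, 10 frame masks.
spec_augment = SpecAugment(time_warp_factor=80, num_frame_masks=10)

def make_sampler(train_cuts: CutSet) -> DynamicBucketingSampler:
    # max_duration (seconds of audio per batch) is a placeholder value;
    # the log only says DynamicBucketingSampler is used.
    return DynamicBucketingSampler(train_cuts, max_duration=200.0, shuffle=True)
```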
595
+ 2025-04-24 09:09:45,039 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
596
+ 2025-04-24 09:09:45,222 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
597
+ 2025-04-24 09:09:45,716 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
598
+ 2025-04-24 09:09:45,867 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
599
+ 2025-04-24 09:09:45,933 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
600
+ 2025-04-24 09:09:45,998 INFO [data_module.py:336] About to create train dataloader
601
+ 2025-04-24 09:09:45,999 INFO [data_module.py:436] About to get test cuts
602
+ 2025-04-24 09:09:45,999 INFO [data_module.py:365] About to create dev dataset
603
+ 2025-04-24 09:09:46,086 INFO [data_module.py:379] About to create dev dataloader
604
+ 2025-04-24 09:09:46,092 INFO [train.py:875] start training from epoch 1
605
+ 2025-04-24 09:09:46,257 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, lenth: 2321
606
+ 2025-04-24 09:09:46,483 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, lenth: 2413
607
+ 2025-04-24 09:09:47,106 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, lenth: 2520
608
+ 2025-04-24 09:09:47,300 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, lenth: 2282
609
+ 2025-04-24 09:09:47,378 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, lenth: 2218
610
+ 2025-04-24 09:09:47,927 WARNING [train.py:847] Exclude cut with ID de225db1da1393474e4ce9af8cbafb42 from training. Duration: 5.433469387755102, lenth: 2230
611
+ 2025-04-24 09:09:48,216 WARNING [train.py:847] Exclude cut with ID ef376c7d1fb4606edb0547b2a7fb95b3 from training. Duration: 2.7399546485260773, lenth: 2637
612
+ 2025-04-24 09:09:48,545 WARNING [train.py:847] Exclude cut with ID c7f3c7da44c2c6e27f86bda421b958d8 from training. Duration: 2.995374149659864, lenth: 2318
613
+ 2025-04-24 09:09:48,715 WARNING [train.py:847] Exclude cut with ID 8d724f452d1d99549eebf90ab658e7bd from training. Duration: 5.712108843537415, lenth: 2209
614
+ 2025-04-24 09:09:48,829 WARNING [train.py:847] Exclude cut with ID a585785e795cb35d02d8f7b599352e16 from training. Duration: 4.911020408163266, lenth: 2575
615
+ 2025-04-24 09:09:48,909 WARNING [train.py:847] Exclude cut with ID 31a00c5e63c88ef67402f578ebc6ebf7 from training. Duration: 3.575873015873016, lenth: 2307
616
+ 2025-04-24 09:09:49,023 WARNING [train.py:847] Exclude cut with ID 32ab4761aaf08d9473fc8388c3a6cffc from training. Duration: 7.9760544217687075, lenth: 2201
617
+ 2025-04-24 09:09:49,079 WARNING [train.py:847] Exclude cut with ID 5fddb6bf9d5265c0409b81c85fadbaec from training. Duration: 3.4597732426303853, lenth: 2210
618
+ 2025-04-24 09:09:49,909 WARNING [train.py:847] Exclude cut with ID c5d43a2443ce7820cf864f7c55eafb77 from training. Duration: 3.912562358276644, lenth: 2315
619
+ 2025-04-24 09:09:50,954 WARNING [train.py:847] Exclude cut with ID a951e36267c8901d4c03fb6b4dffe636 from training. Duration: 4.73687074829932, lenth: 2223
620
+ 2025-04-24 09:09:51,025 WARNING [train.py:847] Exclude cut with ID 595680451c1ee7e42dba0452e165dea3 from training. Duration: 3.959002267573696, lenth: 2251
621
+ 2025-04-24 09:09:51,576 WARNING [train.py:847] Exclude cut with ID 20fa9b11557159d78f72ccc6b464de18 from training. Duration: 3.9938321995464854, lenth: 2215
622
+ 2025-04-24 09:09:51,630 WARNING [train.py:847] Exclude cut with ID 8a01fc72b4108673fd8393e523450e23 from training. Duration: 8.719092970521542, lenth: 2517
623
+ 2025-04-24 09:09:52,139 WARNING [train.py:847] Exclude cut with ID 924d54288847d448a20809a26b320477 from training. Duration: 3.81968253968254, lenth: 2302
624
+ 2025-04-24 09:09:52,437 WARNING [train.py:847] Exclude cut with ID a7f91a17f6318d896d0bee7b7307d000 from training. Duration: 5.294149659863946, lenth: 2239
625
+ 2025-04-24 09:09:52,922 WARNING [train.py:847] Exclude cut with ID 8d7cff5d1d25a773f9d27590b5912b60 from training. Duration: 4.597551020408163, lenth: 2203
626
+ 2025-04-24 09:09:53,040 WARNING [train.py:847] Exclude cut with ID 13407ca1ec78def38346569b54d11a78 from training. Duration: 5.909478458049887, lenth: 2280
627
+ 2025-04-24 09:09:53,044 WARNING [train.py:847] Exclude cut with ID 6390ab6964b52f2ee9bf36826393abaa from training. Duration: 4.017052154195011, lenth: 2209
628
+ 2025-04-24 09:09:53,086 WARNING [train.py:847] Exclude cut with ID f55c19435d6643ebf7356a1072a014fa from training. Duration: 5.259319727891157, lenth: 2310
629
+ 2025-04-24 09:09:53,765 WARNING [train.py:847] Exclude cut with ID 546f5fb6d11db41d104ee7b0667f4e1d from training. Duration: 5.712108843537415, lenth: 2770
630
+ 2025-04-24 09:09:54,071 WARNING [train.py:847] Exclude cut with ID b2069846ab4df18f5caa676751aa0b40 from training. Duration: 6.6525170068027215, lenth: 2588
631
+ 2025-04-24 09:09:54,445 WARNING [train.py:847] Exclude cut with ID 5717a65b9855425c0330e6022a71cc2f from training. Duration: 5.073560090702948, lenth: 2269
632
+ 2025-04-24 09:09:54,746 WARNING [train.py:847] Exclude cut with ID d6e37a975b3c69a86a808347b53e42f2 from training. Duration: 5.259319727891157, lenth: 2885
633
+ 2025-04-24 09:09:54,894 WARNING [train.py:847] Exclude cut with ID d15e75257f8dac616c078113f8f96c63 from training. Duration: 7.6741950113378685, lenth: 2276
634
+ 2025-04-24 09:09:54,958 WARNING [train.py:847] Exclude cut with ID 7ea49b35e0dc536537ae3ddcd07032af from training. Duration: 4.829750566893424, lenth: 2206
635
+ 2025-04-24 09:09:55,412 WARNING [train.py:847] Exclude cut with ID 0a1a80c13fd0c38d97c9491dff84c7da from training. Duration: 2.914104308390023, lenth: 2220
636
+ 2025-04-24 09:09:55,461 WARNING [train.py:847] Exclude cut with ID 359ff41c694bb848c3e37b948b1434f7 from training. Duration: 6.478367346938776, lenth: 2881
637
+ 2025-04-24 09:09:55,469 WARNING [train.py:847] Exclude cut with ID 6989881203588034ddf38ae5a7cfe238 from training. Duration: 5.491519274376417, lenth: 2243
638
+ 2025-04-24 09:09:55,741 WARNING [train.py:847] Exclude cut with ID fd7b5cb9a6bce2c02bf8a7c3dd2b35d6 from training. Duration: 5.793378684807256, lenth: 2305
639
+ 2025-04-24 09:09:56,713 WARNING [train.py:847] Exclude cut with ID e1c5efd6d4cd7c6dc0f3ac7720161c49 from training. Duration: 9.438911564625851, lenth: 2201
640
+ 2025-04-24 09:09:57,105 WARNING [train.py:847] Exclude cut with ID c13ffcc7b60ea460731e417d2d9575cf from training. Duration: 3.250793650793651, lenth: 2205
641
+ 2025-04-24 09:09:57,196 WARNING [train.py:847] Exclude cut with ID ba1a37e64efe00d8b8cbfb0069e65f8d from training. Duration: 3.274013605442177, lenth: 2290
642
+ 2025-04-24 09:09:57,273 WARNING [train.py:847] Exclude cut with ID de9f5fe02034bc4b0a66fbe378fffb1a from training. Duration: 6.455147392290249, lenth: 2228
643
+ 2025-04-24 09:09:57,753 WARNING [train.py:847] Exclude cut with ID 8ca7a35870994edd410e1ea8e951faa3 from training. Duration: 7.012426303854875, lenth: 2233
644
+ 2025-04-24 09:09:57,897 WARNING [train.py:847] Exclude cut with ID 3260368505eddf46fb1c9b562124ff18 from training. Duration: 12.097596371882085, lenth: 2261
645
+ 2025-04-24 09:09:58,088 WARNING [train.py:847] Exclude cut with ID 36878ea551121147626747e9fc39ae17 from training. Duration: 5.537959183673469, lenth: 2224
646
+ 2025-04-24 09:09:59,042 WARNING [train.py:847] Exclude cut with ID d4c7270e6c9b3dffed8643040b5f922b from training. Duration: 4.133151927437642, lenth: 2329
647
+ 2025-04-24 09:09:59,502 WARNING [train.py:847] Exclude cut with ID 179517464c97fbbcb8909a56b849bf73 from training. Duration: 6.617687074829932, lenth: 2257
648
+ 2025-04-24 09:09:59,704 WARNING [train.py:847] Exclude cut with ID 17480e9524e4c0f8e405487e50c8e98e from training. Duration: 4.551111111111111, lenth: 2263
649
+ 2025-04-24 09:09:59,792 WARNING [train.py:847] Exclude cut with ID 2c10af552b911b8ac45c1fc7bb9fcb5d from training. Duration: 6.2693877551020405, lenth: 2481
650
+ 2025-04-24 09:09:59,902 WARNING [train.py:847] Exclude cut with ID aa2539588175d8c96975a06a8993e5b6 from training. Duration: 5.456689342403628, lenth: 2573
651
+ 2025-04-24 09:09:59,933 WARNING [train.py:847] Exclude cut with ID 32945d8e21a8628bb3c93fd499330639 from training. Duration: 13.455963718820861, lenth: 2205
652
+ 2025-04-24 09:10:00,098 WARNING [train.py:847] Exclude cut with ID 64db5d6166c3edc340d8e1b02c059f19 from training. Duration: 4.818140589569161, lenth: 2259
653
+ 2025-04-24 09:10:00,235 WARNING [train.py:847] Exclude cut with ID 39483151e08291efe16217ab41622d59 from training. Duration: 7.952834467120182, lenth: 2439
654
+ 2025-04-24 09:10:00,924 WARNING [train.py:847] Exclude cut with ID c4a74dd23aa9a3fb13330113787139d6 from training. Duration: 3.239183673469388, lenth: 2276
655
+ 2025-04-24 09:10:00,966 WARNING [train.py:847] Exclude cut with ID b12a8836f1aa525a763ac657a44229a3 from training. Duration: 5.0967800453514736, lenth: 2427
656
+ 2025-04-24 09:10:04,105 WARNING [train.py:847] Exclude cut with ID 185cfa3ae9c12babe1e217de612f0c7b from training. Duration: 6.257777777777778, lenth: 2587
657
+ 2025-04-24 09:10:04,192 WARNING [train.py:847] Exclude cut with ID 08eff5b64d191a0d50634630437a3107 from training. Duration: 10.135510204081633, lenth: 2273
658
+ 2025-04-24 09:10:04,279 WARNING [train.py:847] Exclude cut with ID bcd8207538deeccfcf8d1de872b8f7e9 from training. Duration: 4.179591836734694, lenth: 2242
659
+ 2025-04-24 09:10:04,474 WARNING [train.py:847] Exclude cut with ID f909651c923fdd6cc0eaa81de64f4a5b from training. Duration: 8.068934240362811, lenth: 2421
660
+ 2025-04-24 09:10:04,502 WARNING [train.py:847] Exclude cut with ID f5d34097b89bd201018189ff44b10b33 from training. Duration: 3.7964625850340137, lenth: 2250
661
+ 2025-04-24 09:10:04,592 WARNING [train.py:847] Exclude cut with ID 245530abe5a62b390611b7a3b91ddfef from training. Duration: 9.079002267573696, lenth: 2299
662
+ 2025-04-24 09:10:05,468 WARNING [train.py:847] Exclude cut with ID 79d0d3129c764fd4d9c5391b6353e349 from training. Duration: 5.0967800453514736, lenth: 2309
663
+ 2025-04-24 09:10:05,505 WARNING [train.py:847] Exclude cut with ID 1969177f73511ef204642f43ce029d05 from training. Duration: 3.657142857142857, lenth: 2339
664
+ 2025-04-24 09:10:06,014 WARNING [train.py:847] Exclude cut with ID f601e756ecd6074cd3037c61e413cb6e from training. Duration: 5.712108843537415, lenth: 2287
665
+ 2025-04-24 09:10:06,271 WARNING [train.py:847] Exclude cut with ID 9f7758e035b3eebcb9267aa49ad65d4c from training. Duration: 3.9938321995464854, lenth: 2303
666
+ 2025-04-24 09:10:07,035 WARNING [train.py:847] Exclude cut with ID b29a42e8c0af813bf605d7f0a974620c from training. Duration: 5.050340136054421, lenth: 2219
667
+ 2025-04-24 09:10:07,208 WARNING [train.py:847] Exclude cut with ID 192e5517b050a4659892a2832d986764 from training. Duration: 5.12, lenth: 2420
668
+ 2025-04-24 09:10:07,376 WARNING [train.py:847] Exclude cut with ID bc262e923d16a434a3c5701bea841e4e from training. Duration: 7.418775510204082, lenth: 2205
669
+ 2025-04-24 09:10:07,535 WARNING [train.py:847] Exclude cut with ID 5141e01019dec47ed309fbf5881ab675 from training. Duration: 8.034104308390022, lenth: 2284
670
+ 2025-04-24 09:10:07,564 WARNING [train.py:847] Exclude cut with ID 3555e5c7123d744b39ba7ecce44ce1e2 from training. Duration: 3.7151927437641725, lenth: 2226
671
+ 2025-04-24 09:10:08,012 WARNING [train.py:847] Exclude cut with ID 973a1dc43ca452580d6999027a82fbec from training. Duration: 5.270929705215419, lenth: 2250
672
+ 2025-04-24 09:10:08,113 WARNING [train.py:847] Exclude cut with ID e2791accb62bb5e4bde9e0e2b04945c5 from training. Duration: 3.517823129251701, lenth: 2286
673
+ 2025-04-24 09:10:08,285 WARNING [train.py:847] Exclude cut with ID e91152d010c4479932b7ab64a998d5c9 from training. Duration: 4.771700680272109, lenth: 2468
674
+ 2025-04-24 09:10:08,418 WARNING [train.py:847] Exclude cut with ID cff6a8011a86580fcf47595d18819ae9 from training. Duration: 4.458231292517007, lenth: 2205
675
+ 2025-04-24 09:10:08,466 WARNING [train.py:847] Exclude cut with ID 1623cdac3d17f7a211e8a8f434f70657 from training. Duration: 4.191201814058957, lenth: 2308
676
+ 2025-04-24 09:10:09,098 WARNING [train.py:847] Exclude cut with ID 273cfc04aebaf8b0cad541d0f3d1cded from training. Duration: 6.408707482993197, lenth: 2609
677
+ 2025-04-24 09:10:09,182 WARNING [train.py:847] Exclude cut with ID 7d591b090a42935fd1edd8de97c690b6 from training. Duration: 5.038730158730159, lenth: 2229
678
+ 2025-04-24 09:10:09,281 WARNING [train.py:847] Exclude cut with ID 1cfa9ef743b568bb034b415000c8c479 from training. Duration: 7.174965986394557, lenth: 2440
679
+ 2025-04-24 09:10:10,073 WARNING [train.py:847] Exclude cut with ID 762676d460719a9203fa7e7537560530 from training. Duration: 3.1346938775510202, lenth: 2443
680
+ 2025-04-24 09:10:10,581 WARNING [train.py:847] Exclude cut with ID 43575814ad7bb6d631483cdc513b49ac from training. Duration: 10.95981859410431, lenth: 2380
681
+ 2025-04-24 09:10:10,698 WARNING [train.py:847] Exclude cut with ID ae208dee7119ca436f380b99077bcde9 from training. Duration: 5.2128798185941045, lenth: 2615
682
+ 2025-04-24 09:10:10,784 WARNING [train.py:847] Exclude cut with ID b56757708077ab00baa7857c1cef8491 from training. Duration: 5.456689342403628, lenth: 2206
683
+ 2025-04-24 09:10:11,014 WARNING [train.py:847] Exclude cut with ID f3f6d5671f3d11bc83f8fcad598007ff from training. Duration: 4.179591836734694, lenth: 2262
684
+ 2025-04-24 09:10:11,431 WARNING [train.py:847] Exclude cut with ID 6af07f098cd794b9dd5a6bd660f53505 from training. Duration: 4.435011337868481, lenth: 2251
685
+ 2025-04-24 09:10:11,568 WARNING [train.py:847] Exclude cut with ID 4ac1131393136a35ef54499bf3c728d4 from training. Duration: 7.732244897959184, lenth: 2211
686
+ 2025-04-24 09:10:11,570 WARNING [train.py:847] Exclude cut with ID 482877e78defa83a355d2cdeb5a629ed from training. Duration: 5.154829931972789, lenth: 2891
687
+ 2025-04-24 09:10:13,777 WARNING [train.py:847] Exclude cut with ID a93637e8f3e7bdf34bc8eb5d34d34197 from training. Duration: 3.215963718820862, lenth: 2235
688
+ 2025-04-24 09:10:15,429 WARNING [train.py:847] Exclude cut with ID 90c6fbf20eeb34c13ffb97a198d84b0b from training. Duration: 5.0967800453514736, lenth: 2236
689
+ 2025-04-24 09:10:15,536 INFO [train.py:539] Computing validation loss
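Note: the warnings above come from a length guard in train.py that drops cuts whose target token sequence exceeds a threshold before batching. A minimal sketch of that kind of filter, assuming a lhotse CutSet; `max_len` and the `speech_token` custom field are illustrative stand-ins, not the script's actual names:

import logging
from lhotse import CutSet

def remove_long_cuts(cuts: CutSet, max_len: int = 2200) -> CutSet:
    # Keep a cut only if its target token sequence fits the budget;
    # the excluded cuts above all report lengths above ~2200.
    def keep(cut) -> bool:
        tokens = (cut.custom or {}).get("speech_token", [])  # hypothetical field
        if len(tokens) > max_len:
            logging.warning(
                "Exclude cut with ID %s from training. Duration: %s, lenth: %s",
                cut.id, cut.duration, len(tokens),
            )
            return False
        return True
    return cuts.filter(keep)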
log/log-train-2025-04-24-09-15-56 ADDED
@@ -0,0 +1,689 @@
1
+ 2025-04-24 09:15:56,357 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.172.122",
21
+ "hostname": "7515258",
22
+ "icefall-git-branch": "master",
23
+ "icefall-git-date": "Thu Apr 24 08:24:11 2025",
24
+ "icefall-git-sha1": "2e9be467-dirty",
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 40,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
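Note: several of the fields logged above (bucketing_sampler, max_duration, num_buckets, shuffle, drop_last) are sampler settings. A minimal sketch of how they typically map onto a lhotse bucketing sampler; the actual wiring lives in the data module and may differ:

from lhotse.dataset import DynamicBucketingSampler

# `cuts` is assumed to be the training CutSet built from the data/fbank manifests.
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=40,  # "max_duration": seconds of audio per batch
    num_buckets=30,   # "num_buckets"
    shuffle=True,     # "shuffle"
    drop_last=True,   # "drop_last"
)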
71
+ 2025-04-24 09:15:56,357 INFO [train.py:655] About to create model
72
+ 2025-04-24 09:16:18,941 INFO [train.py:808] Number of model parameters: 1326848644
73
+ 2025-04-24 09:16:18,941 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 09:16:18,942 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
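Note: the layer-0 shapes above follow the standard LoRA factorization. For a frozen projection W of shape [out, in], rank r = 64 adapters learn lora_A: [64, in] and lora_B: [out, 64], and the effective weight is W + (alpha/r) * lora_B @ lora_A. The narrow k/v lora_B width of 128 matches Qwen2.5-0.5B's grouped-query KV projections. Arithmetic on the logged shapes gives the trainable count per LLM layer (a worked check, not taken from the log):

attn  = (64*896 + 896*64)       # q_proj A+B = 114,688
attn += (64*896 + 128*64)       # k_proj A+B =  65,536 (KV width 128)
attn += (64*896 + 128*64)       # v_proj A+B =  65,536
attn += (64*896 + 896*64)       # o_proj A+B = 114,688
mlp   = 2 * (64*896 + 4864*64)  # gate/up    = 737,280
mlp  += (64*4864 + 896*64)      # down_proj  = 368,640
per_layer = attn + mlp          # 1,466,368
total_llm = 24 * per_layer      # 35,192,832 LoRA params across 24 layers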
88
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 09:16:18,943 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 09:16:18,944 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 09:16:18,945 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-24 09:16:18,946 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-24 09:16:18,947 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
191
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
192
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
193
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
194
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
195
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
196
+ 2025-04-24 09:16:18,948 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
197
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
198
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
199
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
200
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
201
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
202
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
203
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
204
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
205
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
206
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
207
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
208
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
209
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
210
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
211
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
212
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
213
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
214
+ 2025-04-24 09:16:18,949 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
215
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
216
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
217
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
218
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
219
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
220
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
221
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
222
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
223
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
224
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
225
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
226
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
227
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
228
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
229
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
230
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
231
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
232
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
233
+ 2025-04-24 09:16:18,950 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
234
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
235
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
236
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
237
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
238
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
239
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
240
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
241
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
242
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
243
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
244
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
245
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
246
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
247
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
248
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
249
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
250
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
251
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
252
+ 2025-04-24 09:16:18,951 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
253
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
254
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
255
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
256
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
257
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
258
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
259
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
260
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
261
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
262
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
263
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
264
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
265
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
266
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
267
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
268
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
269
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
270
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
271
+ 2025-04-24 09:16:18,952 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
272
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
273
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
274
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
275
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
276
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
277
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
278
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
279
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
280
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
281
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
282
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
283
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
284
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
285
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
286
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
287
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
288
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
289
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
290
+ 2025-04-24 09:16:18,953 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
291
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
292
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
293
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
294
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
295
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
296
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
297
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
298
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
299
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
300
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
301
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
302
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
303
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
304
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
305
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
306
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
307
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
308
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
309
+ 2025-04-24 09:16:18,954 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
310
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
311
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
312
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
313
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
314
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
315
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
316
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
317
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
318
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
319
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
320
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
321
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
322
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
323
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
324
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
325
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
326
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
327
+ 2025-04-24 09:16:18,955 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
328
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
329
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
330
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
331
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
332
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
333
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
334
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
335
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
336
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
337
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
338
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
339
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
340
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
341
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
342
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
343
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
344
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
345
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
346
+ 2025-04-24 09:16:18,956 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
347
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
348
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
349
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
350
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
351
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
352
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
353
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
354
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
355
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
356
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
357
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
358
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
359
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
360
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
361
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
362
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
363
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
364
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
365
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
366
+ 2025-04-24 09:16:18,957 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
367
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
368
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
369
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
370
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
371
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
372
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
373
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
374
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
375
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
376
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
377
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
378
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
379
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
380
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
381
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
382
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
383
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
384
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
385
+ 2025-04-24 09:16:18,958 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
386
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
387
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
388
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
389
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
390
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
391
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
392
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
393
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
394
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
395
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
396
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
397
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
398
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
399
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
400
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
401
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
402
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
403
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
404
+ 2025-04-24 09:16:18,959 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
405
+ 2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
406
+ 2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
407
+ 2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
408
+ 2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
409
+ 2025-04-24 09:16:18,960 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
410
+ 2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
411
+ 2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
412
+ 2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
413
+ 2025-04-24 09:16:18,960 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
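Note: the projector shapes imply the usual concatenate-and-project downsampling: whisper-large-v2 frames are 1280-dim, and encoder_projector_ds_rate = 8 stacks 8 frames into a 10240-dim vector before mapping to the LLM width of 896. A minimal sketch consistent with those shapes (the ReLU between the two linears is an assumption):

import torch
import torch.nn as nn

class EncoderProjector(nn.Module):
    def __init__(self, enc_dim: int = 1280, ds_rate: int = 8, llm_dim: int = 896):
        super().__init__()
        self.ds_rate = ds_rate
        self.linear1 = nn.Linear(enc_dim * ds_rate, llm_dim)  # weight: [896, 10240]
        self.linear2 = nn.Linear(llm_dim, llm_dim)            # weight: [896, 896]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, frames, 1280); drop ragged tail frames, stack 8 at a time.
        b, t, d = x.shape
        t = t - t % self.ds_rate
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.linear2(torch.relu(self.linear1(x)))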
414
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
415
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
416
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
417
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
418
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
419
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
420
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
421
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
422
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
423
+ 2025-04-24 09:16:18,960 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
424
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
425
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
426
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
427
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
428
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
429
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
430
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
431
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
432
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
433
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
434
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
435
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
436
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
437
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
438
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
439
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
440
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
441
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
442
+ 2025-04-24 09:16:18,961 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
443
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
444
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
445
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
446
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
447
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
448
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
449
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
450
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
451
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
452
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
453
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
454
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
455
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
456
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
457
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
458
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
459
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
460
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
461
+ 2025-04-24 09:16:18,962 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
462
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
463
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
464
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
465
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
466
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
467
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
468
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
469
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
470
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
471
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
472
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
473
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
474
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
475
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
476
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
477
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
478
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
479
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
480
+ 2025-04-24 09:16:18,963 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
481
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
482
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
483
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
484
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
485
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
486
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
487
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
488
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
489
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
490
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
491
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
492
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
493
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
494
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
495
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
496
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
497
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
498
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
499
+ 2025-04-24 09:16:18,964 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
500
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
501
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
502
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
503
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
504
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
505
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
506
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
507
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
508
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
509
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
510
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
511
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
512
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
513
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
514
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
515
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
516
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
517
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
518
+ 2025-04-24 09:16:18,965 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
519
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
520
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
521
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
522
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
523
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
524
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
525
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
526
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
527
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
528
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
529
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
530
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
531
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
532
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
533
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
534
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
535
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
536
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
537
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
538
+ 2025-04-24 09:16:18,966 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
539
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
540
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
541
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
542
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
543
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
544
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
545
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
546
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
547
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
548
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
549
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
550
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
551
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
552
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
553
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
554
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
555
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
556
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
557
+ 2025-04-24 09:16:18,967 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
558
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
559
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
560
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
561
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
562
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
563
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
564
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
565
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
566
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
567
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
568
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
569
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
570
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
571
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
572
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
573
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
574
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
575
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
576
+ 2025-04-24 09:16:18,968 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
577
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
578
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
579
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
580
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
581
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
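Note: the codec_lm.* adapters sketch a smaller "talker" transformer: 12 layers (0-11), hidden size 1024, MLP width 2048, and full-width k/v lora_B matrices ([1024, 64]), i.e. standard multi-head attention rather than GQA. By the same arithmetic as above, each codec-LM layer carries 4 x (64x1024 + 1024x64) + 2 x (64x1024 + 2048x64) + (64x2048 + 1024x64) = 1,114,112 trainable LoRA parameters.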
582
+ 2025-04-24 09:16:18,969 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
583
+ 2025-04-24 09:16:18,969 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
584
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
585
+ 2025-04-24 09:16:18,969 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
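Note: the last four shapes are the glue between the two language models: speech_token_projector maps 896-dim Qwen hidden states into the 1024-dim codec LM, and codec_lm_head scores 4100 classes (plausibly 4096 CosyVoice speech tokens plus a few special tokens; that split is an assumption, not stated in the log). As a sketch:

import torch.nn as nn

llm_dim, codec_dim, codec_vocab = 896, 1024, 4100
speech_token_projector = nn.Linear(llm_dim, codec_dim)  # weight [1024, 896], bias [1024]
codec_lm_head = nn.Linear(codec_dim, codec_vocab)       # weight [4100, 1024], bias [4100]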
586
+ 2025-04-24 09:16:18,969 INFO [train.py:819] Device: cuda:0
587
+ 2025-04-24 09:16:19,658 INFO [train.py:823] Using DeepSpeed
588
+ 2025-04-24 09:16:21,501 INFO [data_module.py:445] About to get train cuts
589
+ 2025-04-24 09:16:21,504 INFO [data_module.py:250] Disable MUSAN
590
+ 2025-04-24 09:16:21,504 INFO [data_module.py:268] Enable SpecAugment
591
+ 2025-04-24 09:16:21,504 INFO [data_module.py:269] Time warp factor: 80
592
+ 2025-04-24 09:16:21,505 INFO [data_module.py:279] Num frame mask: 10
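Note: the three SpecAugment lines above summarize the augmentation recipe. A hedged sketch of the corresponding lhotse transform; only time_warp_factor=80 and num_frame_masks=10 are read from the log, the remaining values are common icefall defaults assumed here rather than taken from data_module.py:

from lhotse.dataset import SpecAugment

spec_aug = SpecAugment(
    time_warp_factor=80,    # "Time warp factor: 80"
    num_frame_masks=10,     # "Num frame mask: 10"
    features_mask_size=27,  # assumed default
    num_feature_masks=2,    # assumed default
    frames_mask_size=100,   # assumed default
)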
593
+ 2025-04-24 09:16:21,505 INFO [data_module.py:292] About to create train dataset
594
+ 2025-04-24 09:16:21,505 INFO [data_module.py:319] Using DynamicBucketingSampler.
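Note: a hedged sketch of the sampler being constructed here, using lhotse's public API with the knobs from the configuration dump ("max_duration": 40, "num_buckets": 30, "shuffle": true, "drop_last": true); the actual call at data_module.py:319 may pass further arguments:

from lhotse.dataset.sampling import DynamicBucketingSampler

train_sampler = DynamicBucketingSampler(
    train_cuts,         # the CutSet from "About to get train cuts"
    max_duration=40.0,  # seconds of audio per batch
    num_buckets=30,
    shuffle=True,
    drop_last=True,
)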
595
+ 2025-04-24 09:16:21,537 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, length: 2321
596
+ 2025-04-24 09:16:21,720 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, length: 2413
597
+ 2025-04-24 09:16:22,210 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, length: 2520
598
+ 2025-04-24 09:16:22,364 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, length: 2282
599
+ 2025-04-24 09:16:22,430 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, length: 2218
600
+ 2025-04-24 09:16:22,494 INFO [data_module.py:336] About to create train dataloader
601
+ 2025-04-24 09:16:22,495 INFO [data_module.py:436] About to get test cuts
602
+ 2025-04-24 09:16:22,496 INFO [data_module.py:365] About to create dev dataset
603
+ 2025-04-24 09:16:22,586 INFO [data_module.py:379] About to create dev dataloader
604
+ 2025-04-24 09:16:22,594 INFO [train.py:875] start training from epoch 1
605
+ 2025-04-24 09:16:22,719 WARNING [train.py:847] Exclude cut with ID c45eb75253bb65901de61b81270c9974 from training. Duration: 6.199727891156463, length: 2321
606
+ 2025-04-24 09:16:22,947 WARNING [train.py:847] Exclude cut with ID 82678aa1e73accb2d11a87440a90fd55 from training. Duration: 5.0967800453514736, length: 2413
607
+ 2025-04-24 09:16:23,573 WARNING [train.py:847] Exclude cut with ID 345bcf1b57cbfcf140007812e992c07a from training. Duration: 5.712108843537415, length: 2520
608
+ 2025-04-24 09:16:23,764 WARNING [train.py:847] Exclude cut with ID 2bc428341a40605b4887a7e46e3ee4ac from training. Duration: 3.657142857142857, length: 2282
609
+ 2025-04-24 09:16:23,840 WARNING [train.py:847] Exclude cut with ID 24c14f6007ceb8d993f6496aeb663c24 from training. Duration: 2.391655328798186, length: 2218
610
+ 2025-04-24 09:16:24,386 WARNING [train.py:847] Exclude cut with ID de225db1da1393474e4ce9af8cbafb42 from training. Duration: 5.433469387755102, length: 2230
611
+ 2025-04-24 09:16:24,677 WARNING [train.py:847] Exclude cut with ID ef376c7d1fb4606edb0547b2a7fb95b3 from training. Duration: 2.7399546485260773, length: 2637
612
+ 2025-04-24 09:16:25,021 WARNING [train.py:847] Exclude cut with ID c7f3c7da44c2c6e27f86bda421b958d8 from training. Duration: 2.995374149659864, length: 2318
613
+ 2025-04-24 09:16:25,194 WARNING [train.py:847] Exclude cut with ID 8d724f452d1d99549eebf90ab658e7bd from training. Duration: 5.712108843537415, length: 2209
614
+ 2025-04-24 09:16:25,310 WARNING [train.py:847] Exclude cut with ID a585785e795cb35d02d8f7b599352e16 from training. Duration: 4.911020408163266, length: 2575
615
+ 2025-04-24 09:16:25,391 WARNING [train.py:847] Exclude cut with ID 31a00c5e63c88ef67402f578ebc6ebf7 from training. Duration: 3.575873015873016, length: 2307
616
+ 2025-04-24 09:16:25,508 WARNING [train.py:847] Exclude cut with ID 32ab4761aaf08d9473fc8388c3a6cffc from training. Duration: 7.9760544217687075, length: 2201
617
+ 2025-04-24 09:16:25,564 WARNING [train.py:847] Exclude cut with ID 5fddb6bf9d5265c0409b81c85fadbaec from training. Duration: 3.4597732426303853, length: 2210
618
+ 2025-04-24 09:16:26,408 WARNING [train.py:847] Exclude cut with ID c5d43a2443ce7820cf864f7c55eafb77 from training. Duration: 3.912562358276644, length: 2315
619
+ 2025-04-24 09:16:27,597 WARNING [train.py:847] Exclude cut with ID a951e36267c8901d4c03fb6b4dffe636 from training. Duration: 4.73687074829932, length: 2223
620
+ 2025-04-24 09:16:27,668 WARNING [train.py:847] Exclude cut with ID 595680451c1ee7e42dba0452e165dea3 from training. Duration: 3.959002267573696, length: 2251
621
+ 2025-04-24 09:16:28,218 WARNING [train.py:847] Exclude cut with ID 20fa9b11557159d78f72ccc6b464de18 from training. Duration: 3.9938321995464854, length: 2215
622
+ 2025-04-24 09:16:28,272 WARNING [train.py:847] Exclude cut with ID 8a01fc72b4108673fd8393e523450e23 from training. Duration: 8.719092970521542, length: 2517
623
+ 2025-04-24 09:16:28,798 WARNING [train.py:847] Exclude cut with ID 924d54288847d448a20809a26b320477 from training. Duration: 3.81968253968254, length: 2302
624
+ 2025-04-24 09:16:29,100 WARNING [train.py:847] Exclude cut with ID a7f91a17f6318d896d0bee7b7307d000 from training. Duration: 5.294149659863946, length: 2239
625
+ 2025-04-24 09:16:29,600 WARNING [train.py:847] Exclude cut with ID 8d7cff5d1d25a773f9d27590b5912b60 from training. Duration: 4.597551020408163, length: 2203
626
+ 2025-04-24 09:16:29,720 WARNING [train.py:847] Exclude cut with ID 13407ca1ec78def38346569b54d11a78 from training. Duration: 5.909478458049887, length: 2280
627
+ 2025-04-24 09:16:29,724 WARNING [train.py:847] Exclude cut with ID 6390ab6964b52f2ee9bf36826393abaa from training. Duration: 4.017052154195011, length: 2209
628
+ 2025-04-24 09:16:29,769 WARNING [train.py:847] Exclude cut with ID f55c19435d6643ebf7356a1072a014fa from training. Duration: 5.259319727891157, length: 2310
629
+ 2025-04-24 09:16:30,441 WARNING [train.py:847] Exclude cut with ID 546f5fb6d11db41d104ee7b0667f4e1d from training. Duration: 5.712108843537415, length: 2770
630
+ 2025-04-24 09:16:30,732 WARNING [train.py:847] Exclude cut with ID b2069846ab4df18f5caa676751aa0b40 from training. Duration: 6.6525170068027215, length: 2588
631
+ 2025-04-24 09:16:31,105 WARNING [train.py:847] Exclude cut with ID 5717a65b9855425c0330e6022a71cc2f from training. Duration: 5.073560090702948, length: 2269
632
+ 2025-04-24 09:16:31,416 WARNING [train.py:847] Exclude cut with ID d6e37a975b3c69a86a808347b53e42f2 from training. Duration: 5.259319727891157, length: 2885
633
+ 2025-04-24 09:16:31,564 WARNING [train.py:847] Exclude cut with ID d15e75257f8dac616c078113f8f96c63 from training. Duration: 7.6741950113378685, length: 2276
634
+ 2025-04-24 09:16:31,632 WARNING [train.py:847] Exclude cut with ID 7ea49b35e0dc536537ae3ddcd07032af from training. Duration: 4.829750566893424, length: 2206
635
+ 2025-04-24 09:16:32,095 WARNING [train.py:847] Exclude cut with ID 0a1a80c13fd0c38d97c9491dff84c7da from training. Duration: 2.914104308390023, length: 2220
636
+ 2025-04-24 09:16:32,143 WARNING [train.py:847] Exclude cut with ID 359ff41c694bb848c3e37b948b1434f7 from training. Duration: 6.478367346938776, length: 2881
637
+ 2025-04-24 09:16:32,151 WARNING [train.py:847] Exclude cut with ID 6989881203588034ddf38ae5a7cfe238 from training. Duration: 5.491519274376417, length: 2243
638
+ 2025-04-24 09:16:32,424 WARNING [train.py:847] Exclude cut with ID fd7b5cb9a6bce2c02bf8a7c3dd2b35d6 from training. Duration: 5.793378684807256, length: 2305
639
+ 2025-04-24 09:16:33,613 WARNING [train.py:847] Exclude cut with ID e1c5efd6d4cd7c6dc0f3ac7720161c49 from training. Duration: 9.438911564625851, length: 2201
640
+ 2025-04-24 09:16:34,013 WARNING [train.py:847] Exclude cut with ID c13ffcc7b60ea460731e417d2d9575cf from training. Duration: 3.250793650793651, length: 2205
641
+ 2025-04-24 09:16:34,104 WARNING [train.py:847] Exclude cut with ID ba1a37e64efe00d8b8cbfb0069e65f8d from training. Duration: 3.274013605442177, length: 2290
642
+ 2025-04-24 09:16:34,183 WARNING [train.py:847] Exclude cut with ID de9f5fe02034bc4b0a66fbe378fffb1a from training. Duration: 6.455147392290249, length: 2228
643
+ 2025-04-24 09:16:34,674 WARNING [train.py:847] Exclude cut with ID 8ca7a35870994edd410e1ea8e951faa3 from training. Duration: 7.012426303854875, length: 2233
644
+ 2025-04-24 09:16:34,821 WARNING [train.py:847] Exclude cut with ID 3260368505eddf46fb1c9b562124ff18 from training. Duration: 12.097596371882085, length: 2261
645
+ 2025-04-24 09:16:35,017 WARNING [train.py:847] Exclude cut with ID 36878ea551121147626747e9fc39ae17 from training. Duration: 5.537959183673469, length: 2224
646
+ 2025-04-24 09:16:35,938 WARNING [train.py:847] Exclude cut with ID d4c7270e6c9b3dffed8643040b5f922b from training. Duration: 4.133151927437642, length: 2329
647
+ 2025-04-24 09:16:36,406 WARNING [train.py:847] Exclude cut with ID 179517464c97fbbcb8909a56b849bf73 from training. Duration: 6.617687074829932, length: 2257
648
+ 2025-04-24 09:16:36,608 WARNING [train.py:847] Exclude cut with ID 17480e9524e4c0f8e405487e50c8e98e from training. Duration: 4.551111111111111, length: 2263
649
+ 2025-04-24 09:16:36,697 WARNING [train.py:847] Exclude cut with ID 2c10af552b911b8ac45c1fc7bb9fcb5d from training. Duration: 6.2693877551020405, length: 2481
650
+ 2025-04-24 09:16:36,805 WARNING [train.py:847] Exclude cut with ID aa2539588175d8c96975a06a8993e5b6 from training. Duration: 5.456689342403628, length: 2573
651
+ 2025-04-24 09:16:36,842 WARNING [train.py:847] Exclude cut with ID 32945d8e21a8628bb3c93fd499330639 from training. Duration: 13.455963718820861, length: 2205
652
+ 2025-04-24 09:16:37,010 WARNING [train.py:847] Exclude cut with ID 64db5d6166c3edc340d8e1b02c059f19 from training. Duration: 4.818140589569161, length: 2259
653
+ 2025-04-24 09:16:37,145 WARNING [train.py:847] Exclude cut with ID 39483151e08291efe16217ab41622d59 from training. Duration: 7.952834467120182, length: 2439
654
+ 2025-04-24 09:16:37,842 WARNING [train.py:847] Exclude cut with ID c4a74dd23aa9a3fb13330113787139d6 from training. Duration: 3.239183673469388, length: 2276
655
+ 2025-04-24 09:16:37,885 WARNING [train.py:847] Exclude cut with ID b12a8836f1aa525a763ac657a44229a3 from training. Duration: 5.0967800453514736, length: 2427
656
+ 2025-04-24 09:16:41,266 WARNING [train.py:847] Exclude cut with ID 185cfa3ae9c12babe1e217de612f0c7b from training. Duration: 6.257777777777778, length: 2587
657
+ 2025-04-24 09:16:41,353 WARNING [train.py:847] Exclude cut with ID 08eff5b64d191a0d50634630437a3107 from training. Duration: 10.135510204081633, length: 2273
658
+ 2025-04-24 09:16:41,441 WARNING [train.py:847] Exclude cut with ID bcd8207538deeccfcf8d1de872b8f7e9 from training. Duration: 4.179591836734694, length: 2242
659
+ 2025-04-24 09:16:41,639 WARNING [train.py:847] Exclude cut with ID f909651c923fdd6cc0eaa81de64f4a5b from training. Duration: 8.068934240362811, length: 2421
660
+ 2025-04-24 09:16:41,667 WARNING [train.py:847] Exclude cut with ID f5d34097b89bd201018189ff44b10b33 from training. Duration: 3.7964625850340137, length: 2250
661
+ 2025-04-24 09:16:41,758 WARNING [train.py:847] Exclude cut with ID 245530abe5a62b390611b7a3b91ddfef from training. Duration: 9.079002267573696, length: 2299
662
+ 2025-04-24 09:16:42,647 WARNING [train.py:847] Exclude cut with ID 79d0d3129c764fd4d9c5391b6353e349 from training. Duration: 5.0967800453514736, length: 2309
663
+ 2025-04-24 09:16:42,684 WARNING [train.py:847] Exclude cut with ID 1969177f73511ef204642f43ce029d05 from training. Duration: 3.657142857142857, length: 2339
664
+ 2025-04-24 09:16:43,198 WARNING [train.py:847] Exclude cut with ID f601e756ecd6074cd3037c61e413cb6e from training. Duration: 5.712108843537415, length: 2287
665
+ 2025-04-24 09:16:43,450 WARNING [train.py:847] Exclude cut with ID 9f7758e035b3eebcb9267aa49ad65d4c from training. Duration: 3.9938321995464854, length: 2303
666
+ 2025-04-24 09:16:43,934 WARNING [train.py:847] Exclude cut with ID b29a42e8c0af813bf605d7f0a974620c from training. Duration: 5.050340136054421, length: 2219
667
+ 2025-04-24 09:16:44,106 WARNING [train.py:847] Exclude cut with ID 192e5517b050a4659892a2832d986764 from training. Duration: 5.12, length: 2420
668
+ 2025-04-24 09:16:44,279 WARNING [train.py:847] Exclude cut with ID bc262e923d16a434a3c5701bea841e4e from training. Duration: 7.418775510204082, length: 2205
669
+ 2025-04-24 09:16:44,447 WARNING [train.py:847] Exclude cut with ID 5141e01019dec47ed309fbf5881ab675 from training. Duration: 8.034104308390022, length: 2284
670
+ 2025-04-24 09:16:44,478 WARNING [train.py:847] Exclude cut with ID 3555e5c7123d744b39ba7ecce44ce1e2 from training. Duration: 3.7151927437641725, length: 2226
671
+ 2025-04-24 09:16:44,934 WARNING [train.py:847] Exclude cut with ID 973a1dc43ca452580d6999027a82fbec from training. Duration: 5.270929705215419, length: 2250
672
+ 2025-04-24 09:16:45,047 WARNING [train.py:847] Exclude cut with ID e2791accb62bb5e4bde9e0e2b04945c5 from training. Duration: 3.517823129251701, length: 2286
673
+ 2025-04-24 09:16:45,223 WARNING [train.py:847] Exclude cut with ID e91152d010c4479932b7ab64a998d5c9 from training. Duration: 4.771700680272109, length: 2468
674
+ 2025-04-24 09:16:45,358 WARNING [train.py:847] Exclude cut with ID cff6a8011a86580fcf47595d18819ae9 from training. Duration: 4.458231292517007, length: 2205
675
+ 2025-04-24 09:16:45,406 WARNING [train.py:847] Exclude cut with ID 1623cdac3d17f7a211e8a8f434f70657 from training. Duration: 4.191201814058957, length: 2308
676
+ 2025-04-24 09:16:46,050 WARNING [train.py:847] Exclude cut with ID 273cfc04aebaf8b0cad541d0f3d1cded from training. Duration: 6.408707482993197, length: 2609
677
+ 2025-04-24 09:16:46,130 WARNING [train.py:847] Exclude cut with ID 7d591b090a42935fd1edd8de97c690b6 from training. Duration: 5.038730158730159, length: 2229
678
+ 2025-04-24 09:16:46,229 WARNING [train.py:847] Exclude cut with ID 1cfa9ef743b568bb034b415000c8c479 from training. Duration: 7.174965986394557, length: 2440
679
+ 2025-04-24 09:16:47,010 WARNING [train.py:847] Exclude cut with ID 762676d460719a9203fa7e7537560530 from training. Duration: 3.1346938775510202, length: 2443
680
+ 2025-04-24 09:16:47,513 WARNING [train.py:847] Exclude cut with ID 43575814ad7bb6d631483cdc513b49ac from training. Duration: 10.95981859410431, length: 2380
681
+ 2025-04-24 09:16:47,634 WARNING [train.py:847] Exclude cut with ID ae208dee7119ca436f380b99077bcde9 from training. Duration: 5.2128798185941045, length: 2615
682
+ 2025-04-24 09:16:47,726 WARNING [train.py:847] Exclude cut with ID b56757708077ab00baa7857c1cef8491 from training. Duration: 5.456689342403628, length: 2206
683
+ 2025-04-24 09:16:47,959 WARNING [train.py:847] Exclude cut with ID f3f6d5671f3d11bc83f8fcad598007ff from training. Duration: 4.179591836734694, length: 2262
684
+ 2025-04-24 09:16:48,380 WARNING [train.py:847] Exclude cut with ID 6af07f098cd794b9dd5a6bd660f53505 from training. Duration: 4.435011337868481, length: 2251
685
+ 2025-04-24 09:16:48,518 WARNING [train.py:847] Exclude cut with ID 4ac1131393136a35ef54499bf3c728d4 from training. Duration: 7.732244897959184, length: 2211
686
+ 2025-04-24 09:16:48,520 WARNING [train.py:847] Exclude cut with ID 482877e78defa83a355d2cdeb5a629ed from training. Duration: 5.154829931972789, length: 2891
687
+ 2025-04-24 09:16:51,265 WARNING [train.py:847] Exclude cut with ID a93637e8f3e7bdf34bc8eb5d34d34197 from training. Duration: 3.215963718820862, length: 2235
688
+ 2025-04-24 09:16:52,934 WARNING [train.py:847] Exclude cut with ID 90c6fbf20eeb34c13ffb97a198d84b0b from training. Duration: 5.0967800453514736, length: 2236
689
+ 2025-04-24 09:16:53,026 INFO [train.py:539] Computing validation loss
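Note: every excluded cut above has a token length greater than 2200 while durations range from about 2.4 s to 13.5 s, so the filter at train.py:847 caps the target token sequence length, not the audio duration. A minimal reconstruction under that reading (the 2200 threshold and the field name are inferred from the log, not read from the repo):

import logging

def keep_cut(cut, max_len: int = 2200) -> bool:
    # hypothetical field name for the answer speech-token sequence
    length = len(cut.custom["answer_cosyvoice_speech_token"])
    if length > max_len:
        logging.warning(
            f"Exclude cut with ID {cut.id} from training. "
            f"Duration: {cut.duration}, length: {length}"
        )
        return False
    return True

train_cuts = train_cuts.filter(keep_cut)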
log/log-train-2025-04-24-10-12-23 ADDED
@@ -0,0 +1,600 @@
1
+ 2025-04-24 10:12:23,878 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.172.122",
21
+ "hostname": "7515258",
22
+ "icefall-git-branch": "master",
23
+ "icefall-git-date": "Thu Apr 24 08:24:11 2025",
24
+ "icefall-git-sha1": "2e9be467-dirty",
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 40,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
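Note: this rerun keeps the same DeepSpeed setup ("deepspeed": true, ZeRO-1 config at ./slam_omni/ds_config_zero1.json) with fp16 enabled. The JSON file itself is not part of this diff; a representative stage-1 config consistent with the flags above, written as a Python dict (contents assumed, not taken from the repo):

ds_config = {
    "train_micro_batch_size_per_gpu": 1,  # assumed; the real file sets its own value
    "gradient_accumulation_steps": 1,     # assumed
    "fp16": {"enabled": True},            # matches "use_fp16": true
    "zero_optimization": {"stage": 1},    # ZeRO stage 1 partitions optimizer states
}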
71
+ 2025-04-24 10:12:23,878 INFO [train.py:655] About to create model
72
+ 2025-04-24 10:12:44,166 INFO [train.py:808] Number of model parameters: 1326848644
73
+ 2025-04-24 10:12:44,166 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 10:12:44,167 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
88
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 10:12:44,168 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 10:12:44,169 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 10:12:44,170 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-24 10:12:44,171 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-24 10:12:44,172 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-24 10:12:44,173 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
191
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
192
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
193
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
194
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
195
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
196
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
197
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
198
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
199
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
200
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
201
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
202
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
203
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
204
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
205
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
206
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
207
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
208
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
209
+ 2025-04-24 10:12:44,174 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
210
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
211
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
212
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
213
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
214
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
215
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
216
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
217
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
218
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
219
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
220
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
221
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
222
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
223
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
224
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
225
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
226
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
227
+ 2025-04-24 10:12:44,175 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
228
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
229
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
230
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
231
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
232
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
233
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
234
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
235
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
236
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
237
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
238
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
239
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
240
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
241
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
242
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
243
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
244
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
245
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
246
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
247
+ 2025-04-24 10:12:44,176 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
248
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
249
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
250
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
251
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
252
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
253
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
254
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
255
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
256
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
257
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
258
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
259
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
260
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
261
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
262
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
263
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
264
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
265
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
266
+ 2025-04-24 10:12:44,177 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
267
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
268
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
269
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
270
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
271
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
272
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
273
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
274
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
275
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
276
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
277
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
278
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
279
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
280
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
281
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
282
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
283
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
284
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
285
+ 2025-04-24 10:12:44,178 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
286
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
287
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
288
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
289
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
290
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
291
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
292
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
293
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
294
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
295
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
296
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
297
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
298
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
299
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
300
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
301
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
302
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
303
+ 2025-04-24 10:12:44,179 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
304
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
305
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
306
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
307
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
308
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
309
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
310
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
311
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
312
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
313
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
314
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
315
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
316
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
317
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
318
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
319
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
320
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
321
+ 2025-04-24 10:12:44,180 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
322
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
323
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
324
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
325
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
326
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
327
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
328
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
329
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
330
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
331
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
332
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
333
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
334
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
335
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
336
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
337
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
338
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
339
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
340
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
341
+ 2025-04-24 10:12:44,181 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
342
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
343
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
344
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
345
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
346
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
347
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
348
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
349
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
350
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
351
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
352
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
353
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
354
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
355
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
356
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
357
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
358
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
359
+ 2025-04-24 10:12:44,182 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
360
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
361
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
362
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
363
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
364
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
365
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
366
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
367
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
368
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
369
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
370
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
371
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
372
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
373
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
374
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
375
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
376
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
377
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
378
+ 2025-04-24 10:12:44,183 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
379
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
380
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
381
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
382
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
383
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
384
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
385
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
386
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
387
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
388
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
389
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
390
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
391
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
392
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
393
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
394
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
395
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
396
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
397
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
398
+ 2025-04-24 10:12:44,184 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
399
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
400
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
401
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
402
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
403
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
404
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
405
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
406
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
407
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
408
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
409
+ 2025-04-24 10:12:44,185 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
410
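The lora_A/lora_B shapes logged above follow directly from how PEFT wraps each targeted nn.Linear: for a Linear(in_features, out_features) adapted at rank r, lora_A.weight has shape [r, in_features] and lora_B.weight has shape [out_features, r]. The leading 64 in every lora_A shape therefore implies r=64. The 896-dim q_proj/o_proj outputs, 128-dim k_proj/v_proj outputs (grouped-query attention with a narrow KV width), and 4864-dim MLP intermediate are consistent with a Qwen2.5-0.5B-style backbone. A minimal sketch of a LoraConfig that would reproduce these shapes; lora_alpha and lora_dropout are assumptions, as neither is visible in this log:

    from peft import LoraConfig, get_peft_model

    lora_config = LoraConfig(
        r=64,                 # implied by the [64, ...] lora_A shapes above
        lora_alpha=16,        # assumption: not shown in this log
        lora_dropout=0.05,    # assumption: not shown in this log
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
            "gate_proj", "up_proj", "down_proj",     # MLP projections
        ],
        task_type="CAUSAL_LM",
    )
    # llm = get_peft_model(llm, lora_config)
    # q_proj (896 -> 896) then gets lora_A [64, 896] and lora_B [896, 64];
    # k_proj/v_proj (896 -> 128) get lora_B [128, 64], exactly as logged.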
+ 2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
411
+ 2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
412
+ 2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
413
+ 2025-04-24 10:12:44,185 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
414
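Unlike the LoRA adapters, the encoder projector is a small fully trained module: a two-layer MLP mapping 10240-dim speech-encoder features into the 896-dim LLM embedding space, for roughly 10M parameters (896*10240 + 896 + 896*896 + 896 = 9,979,648). The 10240 input width suggests consecutive encoder frames are concatenated before projection, for example an assumed 1280-dim encoder output with a stacking factor of 8; only the product is visible here, so both numbers are assumptions. A minimal sketch under those assumptions:

    import torch.nn as nn

    class EncoderProjector(nn.Module):
        # Shapes match the log: linear1.weight [896, 10240], linear2.weight [896, 896].
        def __init__(self, encoder_dim=1280, k=8, llm_dim=896):  # assumed factorization of 10240
            super().__init__()
            self.k = k
            self.linear1 = nn.Linear(encoder_dim * k, llm_dim)
            self.relu = nn.ReLU()  # assumption: the activation is not visible in this log
            self.linear2 = nn.Linear(llm_dim, llm_dim)

        def forward(self, x):
            # x: (batch, frames, encoder_dim); concatenate k consecutive frames,
            # reducing the frame rate by k before projecting into the LLM space
            b, t, d = x.size()
            t = t - (t % self.k)
            x = x[:, :t, :].contiguous().view(b, t // self.k, d * self.k)
            return self.linear2(self.relu(self.linear1(x)))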
+ 2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
415
+ 2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
416
+ 2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
417
+ 2025-04-24 10:12:44,185 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
418
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
419
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
420
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
421
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
422
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
423
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
424
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
425
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
426
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
427
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
428
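The codec_lm adapters below mirror the pattern above at different widths: all four attention projections are 1024-dim (standard multi-head attention, with no grouped-query narrowing of k/v) and the MLP intermediate is 2048. A quick sanity check of the trainable LoRA parameter counts implied by these logged shapes; the LLM listing above runs through layers 0-23, while the codec LM layer count is not fully shown in this excerpt:

    R = 64
    def lora_params(in_f, out_f, r=R):
        # one adapted Linear contributes r*in_f (lora_A) + out_f*r (lora_B) weights
        return r * in_f + out_f * r

    llm_layer = (2 * lora_params(896, 896)     # q_proj, o_proj
                 + 2 * lora_params(896, 128)   # k_proj, v_proj
                 + 2 * lora_params(896, 4864)  # gate_proj, up_proj
                 + lora_params(4864, 896))     # down_proj
    codec_layer = (4 * lora_params(1024, 1024)    # q/k/v/o
                   + 2 * lora_params(1024, 2048)  # gate/up
                   + lora_params(2048, 1024))     # down
    print(llm_layer, 24 * llm_layer)  # 1466368 per layer, ~35.2M across layers 0-23
    print(codec_layer)                # 1114112 per codec LM layer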
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
429
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
430
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
431
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
432
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
433
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
434
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
435
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
436
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
437
+ 2025-04-24 10:12:44,186 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
438
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
439
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
440
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
441
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
442
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
443
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
444
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
445
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
446
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
447
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
448
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
449
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
450
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
451
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
452
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
453
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
454
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
455
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
456
+ 2025-04-24 10:12:44,187 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
457
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
458
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
459
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
460
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
461
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
462
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
463
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
464
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
465
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
466
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
467
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
468
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
469
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
470
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
471
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
472
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
473
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
474
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
475
+ 2025-04-24 10:12:44,188 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
476
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
477
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
478
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
479
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
480
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
481
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
482
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
483
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
484
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
485
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
486
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
487
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
488
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
489
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
490
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
491
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
492
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
493
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
494
+ 2025-04-24 10:12:44,189 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
495
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
496
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
497
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
498
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
499
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
500
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
501
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
502
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
503
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
504
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
505
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
506
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
507
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
508
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
509
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
510
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
511
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
512
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
513
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
514
+ 2025-04-24 10:12:44,190 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
515
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
516
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
517
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
518
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
519
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
520
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
521
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
522
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
523
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
524
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
525
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
526
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
527
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
528
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
529
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
530
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
531
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
532
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
533
+ 2025-04-24 10:12:44,191 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
534
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
535
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
536
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
537
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
538
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
539
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
540
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
541
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
542
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
543
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
544
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
545
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
546
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
547
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
548
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
549
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
550
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
551
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
552
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
553
+ 2025-04-24 10:12:44,192 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
554
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
555
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
556
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
557
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
558
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
559
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
560
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
561
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
562
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
563
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
564
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
565
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
566
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
567
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
568
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
569
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
570
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
571
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
572
+ 2025-04-24 10:12:44,193 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
573
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
574
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
575
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
576
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
577
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
578
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
579
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
580
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
581
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
582
+ 2025-04-24 10:12:44,194 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 896])
583
+ 2025-04-24 10:12:44,194 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
584
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
585
+ 2025-04-24 10:12:44,194 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
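Note: the trainable-parameter dump ends here for the codec-LM section. Every attention and MLP projection of the 12-layer, 1024-dim codec LM carries a rank-64 LoRA pair (lora_A: [64, in_features], lora_B: [out_features, 64]), followed by two full-rank modules that bridge the 896-dim LLM hidden states into the codec LM and map its 1024-dim states onto a 4100-way codec vocabulary. A minimal sketch of a peft configuration that would produce exactly these shapes (illustrative; the recipe's own wrapper code is not shown in this log):

import torch.nn as nn
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=64,  # the leading 64 in every lora_A shape above
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",   # attention: 1024 -> 1024
                    "gate_proj", "up_proj", "down_proj"],     # MLP: 1024 <-> 2048
    task_type="CAUSAL_LM",
)
# codec_lm = get_peft_model(codec_lm, lora_cfg)  # codec_lm: the Qwen2-style token LM

speech_token_projector = nn.Linear(896, 1024)  # LLM hidden -> codec-LM hidden
codec_lm_head = nn.Linear(1024, 4100)          # 4100-way head; the exact vocab split is an assumption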
586
+ 2025-04-24 10:12:44,194 INFO [train.py:819] Device: cuda:0
587
+ 2025-04-24 10:12:44,922 INFO [train.py:823] Using DeepSpeed
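At this point the run hands the model to DeepSpeed. The actual contents of the ZeRO-1 JSON named in the hyperparameters (./slam_omni/ds_config_zero1.json) are not part of this log; the sketch below is only a typical stage-1 + fp16 configuration of the kind that filename suggests, not the repo's file:

import deepspeed

ds_config = {
    "train_micro_batch_size_per_gpu": 1,  # assumed; the real value is not logged
    "zero_optimization": {"stage": 1},    # "zero1" in the filename implies stage 1
    "fp16": {"enabled": True},            # consistent with "use_fp16": true below
}
# model_engine, _, _, _ = deepspeed.initialize(
#     model=model, model_parameters=trainable_params, config=ds_config)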
588
+ 2025-04-24 10:12:46,690 INFO [data_module.py:445] About to get train cuts
589
+ 2025-04-24 10:12:46,693 INFO [data_module.py:250] Disable MUSAN
590
+ 2025-04-24 10:12:46,693 INFO [data_module.py:268] Enable SpecAugment
591
+ 2025-04-24 10:12:46,693 INFO [data_module.py:269] Time warp factor: 80
592
+ 2025-04-24 10:12:46,693 INFO [data_module.py:279] Num frame mask: 10
593
+ 2025-04-24 10:12:46,693 INFO [data_module.py:292] About to create train dataset
594
+ 2025-04-24 10:12:46,693 INFO [data_module.py:319] Using DynamicBucketingSampler.
595
+ 2025-04-24 10:12:47,691 INFO [data_module.py:336] About to create train dataloader
596
+ 2025-04-24 10:12:47,692 INFO [data_module.py:436] About to get test cuts
597
+ 2025-04-24 10:12:47,693 INFO [data_module.py:365] About to create dev dataset
598
+ 2025-04-24 10:12:47,781 INFO [data_module.py:379] About to create dev dataloader
599
+ 2025-04-24 10:12:47,788 INFO [train.py:875] start training from epoch 1
600
+ 2025-04-24 10:13:18,332 INFO [train.py:539] Computing validation loss
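The data-module lines above (MUSAN disabled, SpecAugment with time-warp factor 80 and 10 frame masks, a DynamicBucketingSampler) correspond to lhotse's standard K2 training pipeline. A rough sketch using lhotse's public API, with max_duration=40 and num_buckets=30 taken from the hyperparameter dump further down (the recipe's data_module.py wrapper differs in its details):

from lhotse.dataset import DynamicBucketingSampler, SpecAugment

spec_aug = SpecAugment(time_warp_factor=80, num_frame_masks=10)  # "Time warp factor: 80", "Num frame mask: 10"

# sampler = DynamicBucketingSampler(
#     train_cuts, max_duration=40.0, num_buckets=30,
#     shuffle=True, drop_last=True)  # "Using DynamicBucketingSampler."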
log/log-train-2025-04-24-10-14-50 ADDED
@@ -0,0 +1,600 @@
1
+ 2025-04-24 10:14:50,019 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.172.122",
21
+ "hostname": "7515258",
22
+ "icefall-git-branch": "master",
23
+ "icefall-git-date": "Thu Apr 24 08:24:11 2025",
24
+ "icefall-git-sha1": "2e9be467-dirty",
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 40,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
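Among these hyperparameters, encoder_projector_ds_rate: 8 sets how strongly the Whisper encoder output is downsampled before it enters the 896-dim Qwen2.5-0.5B LLM. A common realization (a sketch under that assumption, not necessarily this repo's module) stacks every 8 consecutive frames and projects the concatenation:

import torch.nn as nn

class EncoderProjector(nn.Module):
    """Stack ds_rate consecutive encoder frames and project to the LLM width."""
    def __init__(self, encoder_dim=1280, llm_dim=896, ds_rate=8):
        super().__init__()
        self.ds_rate = ds_rate
        self.proj = nn.Linear(encoder_dim * ds_rate, llm_dim)

    def forward(self, x):                 # x: (batch, frames, encoder_dim)
        b, t, d = x.shape
        t = t - t % self.ds_rate          # drop the ragged tail
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.proj(x)               # (batch, frames // 8, 896)

# whisper-large-v2 encoder frames are 1280-dim, matching encoder_dim above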
71
+ 2025-04-24 10:14:50,020 INFO [train.py:655] About to create model
72
+ 2025-04-24 10:15:09,864 INFO [train.py:808] Number of model parameters: 1327766148
73
+ 2025-04-24 10:15:09,865 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
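Layer 0's shapes above encode Qwen2.5-0.5B's grouped-query attention: q_proj and o_proj are 896-wide (14 query heads x head_dim 64), while k_proj and v_proj output only 128 (2 KV heads x 64), and the MLP intermediate width is 4864. A quick per-layer LoRA parameter count from these shapes (the head counts come from the model's published config, not from this log):

r, h, kv, inter = 64, 896, 128, 4864
attn = 2 * (r * h + h * r)        # q_proj, o_proj: A [64,896] + B [896,64]
attn += 2 * (r * h + kv * r)      # k_proj, v_proj: A [64,896] + B [128,64]
mlp = 2 * (r * h + inter * r)     # gate_proj, up_proj: A [64,896] + B [4864,64]
mlp += r * inter + h * r          # down_proj: A [64,4864] + B [896,64]
print(attn + mlp)                 # 1,466,368 LoRA weights per decoder layer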
88
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 10:15:09,866 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 10:15:09,867 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 10:15:09,868 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-24 10:15:09,869 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-24 10:15:09,870 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-24 10:15:09,871 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
191
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
192
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
193
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
194
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
195
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
196
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
197
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
198
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
199
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
200
+ 2025-04-24 10:15:09,872 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
201
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
202
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
203
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
204
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
205
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
206
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
207
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
208
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
209
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
210
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
211
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
212
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
213
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
214
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
215
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
216
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
217
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
218
+ 2025-04-24 10:15:09,873 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
219
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
220
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
221
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
222
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
223
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
224
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
225
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
226
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
227
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
228
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
229
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
230
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
231
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
232
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
233
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
234
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
235
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
236
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
237
+ 2025-04-24 10:15:09,874 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
238
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
239
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
240
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
241
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
242
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
243
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
244
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
245
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
246
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
247
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
248
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
249
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
250
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
251
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
252
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
253
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
254
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
255
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
256
+ 2025-04-24 10:15:09,875 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
257
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
258
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
259
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
260
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
261
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
262
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
263
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
264
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
265
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
266
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
267
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
268
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
269
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
270
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
271
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
272
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
273
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
274
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
275
+ 2025-04-24 10:15:09,876 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
276
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
277
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
278
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
279
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
280
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
281
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
282
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
283
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
284
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
285
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
286
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
287
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
288
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
289
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
290
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
291
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
292
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
293
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
294
+ 2025-04-24 10:15:09,877 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
295
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
296
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
297
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
298
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
299
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
300
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
301
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
302
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
303
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
304
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
305
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
306
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
307
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
308
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
309
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
310
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
311
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
312
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
313
+ 2025-04-24 10:15:09,878 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
314
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
315
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
316
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
317
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
318
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
319
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
320
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
321
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
322
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
323
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
324
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
325
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
326
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
327
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
328
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
329
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
330
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
331
+ 2025-04-24 10:15:09,879 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
332
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
333
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
334
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
335
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
336
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
337
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
338
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
339
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
340
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
341
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
342
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
343
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
344
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
345
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
346
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
347
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
348
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
349
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
350
+ 2025-04-24 10:15:09,880 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
351
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
352
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
353
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
354
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
355
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
356
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
357
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
358
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
359
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
360
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
361
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
362
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
363
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
364
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
365
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
366
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
367
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
368
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
369
+ 2025-04-24 10:15:09,881 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
370
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
371
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
372
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
373
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
374
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
375
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
376
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
377
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
378
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
379
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
380
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
381
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
382
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
383
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
384
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
385
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
386
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
387
+ 2025-04-24 10:15:09,882 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
388
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
389
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
390
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
391
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
392
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
393
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
394
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
395
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
396
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
397
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
398
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
399
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
400
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
401
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
402
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
403
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
404
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
405
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
406
+ 2025-04-24 10:15:09,883 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
407
+ 2025-04-24 10:15:09,884 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
408
+ 2025-04-24 10:15:09,884 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
409
+ 2025-04-24 10:15:09,884 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
410
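Editor's note: the A/B shapes in the listing above are what rank-64 LoRA adapters look like on Qwen2.5-0.5B — lora_A is [64, in_features], lora_B is [out_features, 64], and k_proj/v_proj end in 128 because the model uses grouped-query attention (2 KV heads × 64-dim heads). A minimal sketch that reproduces these shapes, assuming HF transformers + peft (not necessarily the exact code in train.py; lora_alpha is an assumption, the log does not show it):

    from transformers import AutoModelForCausalLM
    from peft import LoraConfig, get_peft_model

    llm = AutoModelForCausalLM.from_pretrained("models/Qwen2.5-0.5B-Instruct")
    lora = LoraConfig(
        r=64,            # lora_A: [64, in_features], lora_B: [out_features, 64]
        lora_alpha=16,   # assumed; not visible in the log
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",   # attention
                        "gate_proj", "up_proj", "down_proj"],     # MLP
        task_type="CAUSAL_LM",
    )
    llm = get_peft_model(llm, lora)
    # k_proj/v_proj have out_features=128 in Qwen2.5-0.5B (grouped-query
    # attention: 2 KV heads x 64 head dim), hence lora_B: [128, 64].
    llm.print_trainable_parameters()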
+ 2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
411
+ 2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
412
+ 2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
413
+ 2025-04-24 10:15:09,884 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
414
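Editor's note: the projector shapes imply that 8 consecutive 1280-dim Whisper encoder frames are concatenated (1280 × 8 = 10240, matching encoder_projector_ds_rate = 8) and mapped into the LLM's 896-dim embedding space. A sketch under those assumptions (the activation between the two linears is assumed):

    import torch
    import torch.nn as nn

    class EncoderProjector(nn.Module):
        def __init__(self, encoder_dim=1280, llm_dim=896, ds_rate=8):
            super().__init__()
            self.ds_rate = ds_rate
            self.linear1 = nn.Linear(encoder_dim * ds_rate, llm_dim)  # [896, 10240]
            self.linear2 = nn.Linear(llm_dim, llm_dim)                # [896, 896]

        def forward(self, x):  # x: (batch, num_frames, encoder_dim)
            b, t, d = x.shape
            t = t - t % self.ds_rate                      # drop the ragged tail
            x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
            return self.linear2(torch.relu(self.linear1(x)))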
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
415
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
416
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
417
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
418
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
419
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
420
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
421
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
422
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
423
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
424
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
425
+ 2025-04-24 10:15:09,884 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
426
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
427
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
428
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
429
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
430
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
431
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
432
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
433
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
434
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
435
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
436
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
437
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
438
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
439
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
440
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
441
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
442
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
443
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
444
+ 2025-04-24 10:15:09,885 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
445
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
446
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
447
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
448
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
449
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
450
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
451
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
452
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
453
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
454
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
455
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
456
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
457
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
458
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
459
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
460
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
461
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
462
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
463
+ 2025-04-24 10:15:09,886 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
464
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
465
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
466
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
467
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
468
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
469
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
470
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
471
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
472
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
473
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
474
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
475
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
476
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
477
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
478
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
479
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
480
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
481
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
482
+ 2025-04-24 10:15:09,887 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
483
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
484
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
485
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
486
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
487
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
488
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
489
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
490
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
491
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
492
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
493
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
494
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
495
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
496
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
497
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
498
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
499
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
500
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
501
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
502
+ 2025-04-24 10:15:09,888 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
503
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
504
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
505
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
506
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
507
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
508
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
509
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
510
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
511
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
512
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
513
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
514
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
515
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
516
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
517
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
518
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
519
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
520
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
521
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
522
+ 2025-04-24 10:15:09,889 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
523
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
524
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
525
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
526
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
527
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
528
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
529
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
530
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
531
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
532
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
533
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
534
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
535
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
536
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
537
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
538
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
539
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
540
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
541
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
542
+ 2025-04-24 10:15:09,890 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
543
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
544
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
545
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
546
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
547
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
548
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
549
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
550
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
551
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
552
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
553
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
554
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
555
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
556
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
557
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
558
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
559
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
560
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
561
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
562
+ 2025-04-24 10:15:09,891 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
563
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
564
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
565
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
566
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
567
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
568
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
569
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
570
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
571
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
572
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
573
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
574
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
575
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
576
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
577
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
578
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
579
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
580
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
581
+ 2025-04-24 10:15:09,892 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
582
+ 2025-04-24 10:15:09,892 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
583
+ 2025-04-24 10:15:09,893 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
584
+ 2025-04-24 10:15:09,893 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
585
+ 2025-04-24 10:15:09,893 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
586
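Editor's note: the remaining trainable pieces form the speech-token head. The 1792-dim input looks like a concatenation of two 896-dim LLM streams, and 4100 plausibly covers 4096 codec codes plus a few special tokens — both readings are assumptions, not confirmed by the log:

    import torch.nn as nn

    speech_token_projector = nn.Linear(1792, 1024)  # LLM features -> codec LM dim
    codec_lm_head = nn.Linear(1024, 4100)           # codec LM dim -> codec vocab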
+ 2025-04-24 10:15:09,893 INFO [train.py:819] Device: cuda:1
587
+ 2025-04-24 10:15:10,679 INFO [train.py:823] Using DeepSpeed
588
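Editor's note: training is handed to DeepSpeed with the ZeRO stage-1 config named in the parameters (./slam_omni/ds_config_zero1.json, fp16 enabled). A hedged sketch of what that hand-off typically looks like; the exact arguments in train.py may differ:

    import deepspeed

    # `model` is the SLAM-Omni model whose trainable parameters are listed above.
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model,
        model_parameters=[p for p in model.parameters() if p.requires_grad],
        config="./slam_omni/ds_config_zero1.json",  # ZeRO stage 1 + fp16
    )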
+ 2025-04-24 10:15:12,380 INFO [data_module.py:445] About to get train cuts
589
+ 2025-04-24 10:15:12,382 INFO [data_module.py:250] Disable MUSAN
590
+ 2025-04-24 10:15:12,382 INFO [data_module.py:268] Enable SpecAugment
591
+ 2025-04-24 10:15:12,382 INFO [data_module.py:269] Time warp factor: 80
592
+ 2025-04-24 10:15:12,382 INFO [data_module.py:279] Num frame mask: 10
593
+ 2025-04-24 10:15:12,382 INFO [data_module.py:292] About to create train dataset
594
+ 2025-04-24 10:15:12,382 INFO [data_module.py:319] Using DynamicBucketingSampler.
595
+ 2025-04-24 10:15:13,374 INFO [data_module.py:336] About to create train dataloader
596
+ 2025-04-24 10:15:13,374 INFO [data_module.py:436] About to get test cuts
597
+ 2025-04-24 10:15:13,375 INFO [data_module.py:365] About to create dev dataset
598
+ 2025-04-24 10:15:13,462 INFO [data_module.py:379] About to create dev dataloader
599
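Editor's note: these data-module messages match lhotse's stock components configured with the values from the config (max_duration = 40, num_buckets = 30, time-warp factor 80, 10 frame masks). A sketch, assuming the standard icefall-style imports:

    from lhotse.dataset import DynamicBucketingSampler, SpecAugment

    # train_cuts: a lhotse CutSet, e.g. the result of the "About to get
    # train cuts" step above (the variable name is assumed here).
    sampler = DynamicBucketingSampler(
        train_cuts,
        max_duration=40,   # seconds of audio per batch
        num_buckets=30,
        shuffle=True,
        drop_last=True,
    )
    spec_aug = SpecAugment(time_warp_factor=80, num_frame_masks=10)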
+ 2025-04-24 10:15:13,468 INFO [train.py:875] start training from epoch 1
600
+ 2025-04-24 10:15:42,675 INFO [train.py:539] Computing validation loss
log/log-train-2025-04-24-10-17-26 ADDED
@@ -0,0 +1,607 @@
1
+ 2025-04-24 10:17:26,814 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.172.122",
21
+ "hostname": "7515258",
22
+ "icefall-git-branch": "master",
23
+ "icefall-git-date": "Thu Apr 24 08:24:11 2025",
24
+ "icefall-git-sha1": "2e9be467-dirty",
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 40,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
71
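Editor's note: a back-of-the-envelope rate check from this config — with 10 ms frames, Whisper's 2× conv subsampling, and the projector's 8× downsampling, each projected speech embedding covers 160 ms of audio (about 6.25 embeddings per second):

    frame_shift_ms = 10       # "frame_shift_ms" in the config
    subsampling_factor = 2    # "subsampling_factor" (Whisper conv front-end)
    ds_rate = 8               # "encoder_projector_ds_rate"
    ms_per_speech_embedding = frame_shift_ms * subsampling_factor * ds_rate  # 160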
+ 2025-04-24 10:17:26,814 INFO [train.py:655] About to create model
72
+ 2025-04-24 10:17:46,427 INFO [train.py:808] Number of model parameters: 1327766148
73
+ 2025-04-24 10:17:46,427 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 10:17:46,428 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
88
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 10:17:46,429 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 10:17:46,430 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-24 10:17:46,431 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-24 10:17:46,432 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-24 10:17:46,433 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,434 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,435 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,436 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,437 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,438 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,439 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,440 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,441 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,442 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,443 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,444 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
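The A/B shape pairs in this listing follow the standard LoRA factorization at rank r=64: lora_A is [r, in_features] and lora_B is [out_features, r]. The [128, 64] shapes on k_proj/v_proj match Qwen2.5-0.5B's grouped-query KV width (128), and [4864, 64] matches its MLP intermediate size. A minimal sketch that reproduces these shapes (illustrative only; the adapters in this log come from wrapping the model with peft, and the actual lora_alpha is not shown here):

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Minimal rank-64 LoRA wrapper reproducing the logged A/B shapes."""

    def __init__(self, base: nn.Linear, r: int = 64, alpha: int = 16):
        super().__init__()
        self.base = base.requires_grad_(False)  # pretrained projection stays frozen
        self.lora_A = nn.Linear(base.in_features, r, bias=False)   # weight: [r, in]
        self.lora_B = nn.Linear(r, base.out_features, bias=False)  # weight: [out, r]
        nn.init.zeros_(self.lora_B.weight)      # adapter starts as a no-op
        self.scaling = alpha / r                # alpha is illustrative, not from the log

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * self.lora_B(self.lora_A(x))

# q_proj in Qwen2.5-0.5B maps 896 -> 896, so A is [64, 896] and B is [896, 64];
# k_proj maps 896 -> 128 under grouped-query attention, giving B = [128, 64].
q = LoRALinear(nn.Linear(896, 896))
k = LoRALinear(nn.Linear(896, 128))
print(q.lora_A.weight.shape, q.lora_B.weight.shape, k.lora_B.weight.shape)
```

Only the A/B factors are trainable, which is why they are the only llm.* tensors in this parameter dump.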
+ 2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
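The encoder_projector shapes pair with "encoder_projector_ds_rate": 8 from the config below: eight 1280-dim Whisper encoder frames are stacked into a 10240-dim vector and mapped into the LLM's 896-dim embedding space. A sketch consistent with those shapes (the ReLU between the two linears and the exact frame-stacking code are assumptions, not taken from this log):

```python
import torch
import torch.nn as nn

class EncoderProjector(nn.Module):
    """Sketch matching the logged shapes: stack ds_rate=8 Whisper frames
    (8 * 1280 = 10240) and project into the 896-dim Qwen2.5-0.5B space."""

    def __init__(self, encoder_dim: int = 1280, llm_dim: int = 896, ds_rate: int = 8):
        super().__init__()
        self.ds_rate = ds_rate
        self.linear1 = nn.Linear(encoder_dim * ds_rate, llm_dim)  # weight: [896, 10240]
        self.linear2 = nn.Linear(llm_dim, llm_dim)                # weight: [896, 896]

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (batch, T, 1280)
        b, t, d = x.shape
        t = t - t % self.ds_rate                # drop a ragged tail, if any
        x = x[:, :t].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.linear2(torch.relu(self.linear1(x)))

proj = EncoderProjector()
print(proj(torch.randn(2, 100, 1280)).shape)    # torch.Size([2, 12, 896])
```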
+ 2025-04-24 10:17:46,445 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,445 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,446 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,447 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,448 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,449 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,450 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,451 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,452 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,453 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
+ 2025-04-24 10:17:46,454 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
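These last trainable tensors glue the two LMs together: speech_token_projector maps a 1792-dim input (= 2 x 896, plausibly two concatenated 896-dim LLM hidden streams) into the 1024-dim codec LM, and codec_lm_head scores a 4100-way codec-token vocabulary (plausibly a 4096-entry codebook plus a few special tokens). Both readings are inferred from the shapes alone, not confirmed by this log. A sketch:

```python
import torch
import torch.nn as nn

llm_dim, codec_dim, codec_vocab = 896, 1024, 4100   # from the logged shapes

# weight [1024, 1792]: 1792 = 2 * 896, consistent with concatenating two
# 896-dim LLM hidden streams before the codec LM (an inference, not confirmed).
speech_token_projector = nn.Linear(2 * llm_dim, codec_dim)
codec_lm_head = nn.Linear(codec_dim, codec_vocab)   # weight [4100, 1024]

h = torch.randn(2, 10, 2 * llm_dim)                 # concatenated LLM features
z = speech_token_projector(h)                       # (2, 10, 1024), fed to the codec LM
logits = codec_lm_head(torch.randn(2, 10, codec_dim))  # per-step codec-token logits
print(z.shape, logits.shape)
```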
+ 2025-04-24 10:17:46,454 INFO [train.py:819] Device: cuda:0
+ 2025-04-24 10:17:47,363 INFO [train.py:823] Using DeepSpeed
+ 2025-04-24 10:17:49,160 INFO [data_module.py:445] About to get train cuts
+ 2025-04-24 10:17:49,163 INFO [data_module.py:250] Disable MUSAN
+ 2025-04-24 10:17:49,163 INFO [data_module.py:268] Enable SpecAugment
+ 2025-04-24 10:17:49,163 INFO [data_module.py:269] Time warp factor: 80
+ 2025-04-24 10:17:49,163 INFO [data_module.py:279] Num frame mask: 10
+ 2025-04-24 10:17:49,163 INFO [data_module.py:292] About to create train dataset
+ 2025-04-24 10:17:49,163 INFO [data_module.py:319] Using DynamicBucketingSampler.
+ 2025-04-24 10:17:50,163 INFO [data_module.py:336] About to create train dataloader
+ 2025-04-24 10:17:50,164 INFO [data_module.py:436] About to get test cuts
+ 2025-04-24 10:17:50,164 INFO [data_module.py:365] About to create dev dataset
+ 2025-04-24 10:17:50,251 INFO [data_module.py:379] About to create dev dataloader
+ 2025-04-24 10:17:50,260 INFO [train.py:875] start training from epoch 1
+ 2025-04-24 10:18:19,657 INFO [train.py:539] Computing validation loss
+ 2025-04-24 10:18:23,605 INFO [utils.py:1966] Filtering uneven-sized batch, original batch size is 2, retained batch size is 1.
+ 2025-04-24 10:18:36,169 INFO [train.py:548] Epoch 1, validation: loss=0.007026, acc=0.4635, codec_acc=0.0004482, codec_topk_acc=0.003435, codec_loss=0.0053, text_loss=0.001726, over 284780.00 frames.
+ 2025-04-24 10:18:36,170 INFO [train.py:549] Maximum memory allocated so far is 9104MB
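Note that the logged validation total decomposes as loss = codec_loss + text_loss (0.0053 + 0.001726 = 0.007026). A quick check, assuming no extra weighting is applied at this point (any weighting inside train.py is not visible in this excerpt):

```python
# Sanity check on the validation metrics printed above.
codec_loss, text_loss, total = 0.0053, 0.001726, 0.007026
assert abs((codec_loss + text_loss) - total) < 1e-9
```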
+ 2025-04-24 10:18:36,617 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.006556, acc=0.4215, codec_acc=0, codec_topk_acc=0.003
+ 2025-04-24 10:18:36,617 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.006369, acc=0.48, codec_acc=0, codec_topk_acc=0.003988, codec_loss=0.004861, text_loss=0.001508, over 1743.00 frames. ], tot_loss[loss=0.006369, acc=0.48, codec_acc=
+ 2025-04-24 10:19:00,389 INFO [train.py:611] Epoch 1, batch 50, loss[loss=0.006524, acc=0.4478, codec_acc=0.01511, codec_top
+ 2025-04-24 10:19:00,389 INFO [train.py:611] Epoch 1, batch 50, loss[loss=0.006777, acc=0.3789, codec_acc=0.01844, codec_topk_acc=0.08811, codec_loss=0.004652, text_loss=0.002125, over 1371.00 frames. ], tot_loss[loss=0.005674, acc=0.4825, codec_acc=0.0086
+ 2025-04-24 10:19:26,931 INFO [train.py:611] Epoch 1, batch 100, loss[loss=0.00485, acc=0.4072, codec_acc=0.02365, codec_topk
+ 2025-04-24 10:19:26,931 INFO [train.py:611] Epoch 1, batch 100, loss[loss=0.004453, acc=0.6573, codec_acc=0.02348, codec_topk_acc=0.08241, codec_loss=0.003445, text_loss=0.001008, over 1826.00 frames. ], tot_loss[loss=0.005231, acc=0.4903, codec_acc=0.0
+ 2025-04-24 10:19:40,453 INFO [train.py:940] Saving batch to slam_omni/exp_speech2speech_rerun/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+ 2025-04-24 10:19:40,475 INFO [train.py:945] features shape: torch.Size([8, 455, 80])
+ h-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+ 2025-04-24 10:19:40,473 INFO [train.py:945] features shape: torch.Size([8, 457, 80])
log/log-train-2025-04-24-10-21-52 ADDED
@@ -0,0 +1,604 @@
1
+ 2025-04-24 10:21:52,552 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.183.253",
21
+ "hostname": "7518205",
22
+ "icefall-git-branch": null,
23
+ "icefall-git-date": null,
24
+ "icefall-git-sha1": null,
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 50,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": null,
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": null,
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
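With use_lora enabled, every lora_A/lora_B pair listed below follows the standard LoRA shape rule: lora_A is [r, in_features] and lora_B is [out_features, r], here with rank r = 64. A minimal sketch with HuggingFace peft that would produce these shapes (assumed, not necessarily the recipe's exact call):

    from peft import LoraConfig, get_peft_model

    # r=64 yields e.g. q_proj lora_A [64, 896] and lora_B [896, 64] for the
    # 896-dim Qwen2.5-0.5B, and the [64, 1024]/[1024, 64] pairs of the codec LM.
    lora_config = LoraConfig(
        r=64,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )
    llm = get_peft_model(llm, lora_config)  # 'llm' is an assumed model variable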
71
+ 2025-04-24 10:21:52,552 INFO [train.py:655] About to create model
72
+ 2025-04-24 10:22:24,486 INFO [train.py:808] Number of model parameters: 1327766148
73
+ 2025-04-24 10:22:24,487 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-24 10:22:24,487 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
88
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-24 10:22:24,488 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-24 10:22:24,489 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-24 10:22:24,490 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-24 10:22:24,491 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-24 10:22:24,492 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-24 10:22:24,493 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
191
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
192
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
193
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
194
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
195
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
196
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
197
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
198
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
199
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
200
+ 2025-04-24 10:22:24,494 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
201
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
202
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
203
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
204
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
205
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
206
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
207
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
208
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
209
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
210
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
211
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
212
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
213
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
214
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
215
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
216
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
217
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
218
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
219
+ 2025-04-24 10:22:24,495 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
220
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
221
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
222
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
223
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
224
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
225
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
226
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
227
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
228
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
229
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
230
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
231
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
232
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
233
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
234
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
235
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
236
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
237
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
238
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
239
+ 2025-04-24 10:22:24,496 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
240
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
241
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
242
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
243
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
244
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
245
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
246
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
247
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
248
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
249
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
250
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
251
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
252
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
253
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
254
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
255
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
256
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
257
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
258
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
259
+ 2025-04-24 10:22:24,497 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
260
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
261
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
262
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
263
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
264
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
265
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
266
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
267
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
268
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
269
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
270
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
271
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
272
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
273
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
274
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
275
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
276
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
277
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
278
+ 2025-04-24 10:22:24,498 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
279
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
280
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
281
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
282
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
283
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
284
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
285
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
286
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
287
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
288
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
289
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
290
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
291
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
292
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
293
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
294
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
295
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
296
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
297
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
298
+ 2025-04-24 10:22:24,499 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
299
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
300
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
301
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
302
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
303
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
304
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
305
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
306
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
307
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
308
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
309
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
310
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
311
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
312
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
313
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
314
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
315
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
316
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
317
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
318
+ 2025-04-24 10:22:24,500 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
319
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
320
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
321
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
322
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
323
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
324
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
325
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
326
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
327
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
328
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
329
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
330
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
331
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
332
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
333
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
334
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
335
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
336
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
337
+ 2025-04-24 10:22:24,501 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
338
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
339
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
340
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
341
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
342
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
343
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
344
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
345
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
346
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
347
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
348
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
349
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
350
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
351
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
352
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
353
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
354
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
355
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
356
+ 2025-04-24 10:22:24,502 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
357
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
358
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
359
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
360
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
361
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
362
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
363
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
364
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
365
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
366
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
367
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
368
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
369
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
370
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
371
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
372
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
373
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
374
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
375
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
376
+ 2025-04-24 10:22:24,503 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
377
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
378
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
379
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
380
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
381
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
382
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
383
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
384
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
385
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
386
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
387
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
388
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
389
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
390
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
391
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
392
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
393
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
394
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
395
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
396
+ 2025-04-24 10:22:24,504 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
397
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
398
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
399
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
400
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
401
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
402
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
403
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
404
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
405
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
406
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
407
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
408
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
409
+ 2025-04-24 10:22:24,505 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
410
+ 2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
411
+ 2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
412
+ 2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
413
+ 2025-04-24 10:22:24,505 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
414
+ 2025-04-24 10:22:24,505 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
415
+ 2025-04-24 10:22:24,505 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
416
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
417
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
418
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
419
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
420
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
421
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
422
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
423
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
424
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
425
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
426
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
427
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
428
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
429
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
430
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
431
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
432
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
433
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
434
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
435
+ 2025-04-24 10:22:24,506 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
436
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
437
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
438
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
439
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
440
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
441
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
442
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
443
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
444
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
445
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
446
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
447
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
448
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
449
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
450
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
451
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
452
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
453
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
454
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
455
+ 2025-04-24 10:22:24,507 INFO [train.py:813] codec_lm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
456
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
457
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
458
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
459
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
460
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
461
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
462
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
463
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
464
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
465
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
466
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
467
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
468
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
469
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
470
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
471
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
472
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
473
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
474
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
475
+ 2025-04-24 10:22:24,508 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
476
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
477
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
478
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
479
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
480
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
481
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
482
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
483
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
484
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
485
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
486
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
487
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
488
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
489
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
490
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
491
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
492
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
493
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
494
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
495
+ 2025-04-24 10:22:24,509 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
496
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
497
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
498
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
499
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
500
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
501
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
502
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
503
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
504
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
505
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
506
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
507
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
508
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
509
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
510
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
511
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
512
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
513
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
514
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
515
+ 2025-04-24 10:22:24,510 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
516
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
517
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
518
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
519
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
520
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
521
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
522
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
523
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
524
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
525
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
526
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
527
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
528
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
529
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
530
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
531
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
532
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
533
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
534
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
535
+ 2025-04-24 10:22:24,511 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
536
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
537
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
538
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
539
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
540
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
541
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
542
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
543
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
544
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
545
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
546
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
547
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
548
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
549
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
550
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
551
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
552
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
553
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
554
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
555
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
556
+ 2025-04-24 10:22:24,512 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
557
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
558
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
559
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
560
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
561
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
562
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
563
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
564
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
565
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
566
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
567
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
568
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 1024])
569
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([1024, 64])
570
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 1024])
571
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([1024, 64])
572
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 1024])
573
+ 2025-04-24 10:22:24,513 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 64])
574
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 1024])
575
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([1024, 64])
576
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 1024])
577
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([2048, 64])
578
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 1024])
579
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([2048, 64])
580
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 2048])
581
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([1024, 64])
582
+ 2025-04-24 10:22:24,514 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
583
+ 2025-04-24 10:22:24,514 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
584
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
585
+ 2025-04-24 10:22:24,514 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
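The shape pattern in the parameter listing above is the standard LoRA factorization: for a frozen projection with weight [out_features, in_features], the adapter adds lora_A of shape [r, in_features] and lora_B of shape [out_features, r], here with rank r = 64 throughout. The llm.* rows match Qwen2.5-0.5B (hidden size 896, GQA key/value dim 128, MLP intermediate 4864); the codec_lm.* rows come from a 12-layer codec LM with hidden size 1024 and intermediate 2048; speech_token_projector maps 1792 = 2 x 896 concatenated LLM features into that 1024-dim space, and codec_lm_head scores 4100 classes (presumably 4096 codec tokens plus a few specials). A minimal PEFT sketch that reproduces these shapes (lora_alpha and lora_dropout are assumptions; the log does not record them):

    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM

    llm = AutoModelForCausalLM.from_pretrained("models/Qwen2.5-0.5B-Instruct")
    lora_config = LoraConfig(
        r=64,               # matches the 64-dim inner axis of every lora_A/lora_B pair
        lora_alpha=16,      # assumption: not recorded in the log
        lora_dropout=0.05,  # assumption: not recorded in the log
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )
    llm = get_peft_model(llm, lora_config)
    # Expected adapter shapes for Qwen2.5-0.5B:
    #   q_proj    -> lora_A [64, 896], lora_B [896, 64]
    #   k_proj    -> lora_A [64, 896], lora_B [128, 64]   (GQA kv heads)
    #   gate_proj -> lora_A [64, 896], lora_B [4864, 64]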
586
+ 2025-04-24 10:22:24,514 INFO [train.py:819] Device: cuda:3
587
+ 2025-04-24 10:22:25,742 INFO [train.py:823] Using DeepSpeed
588
+ 2025-04-24 10:22:52,214 INFO [data_module.py:445] About to get train cuts
589
+ 2025-04-24 10:22:52,216 INFO [data_module.py:250] Disable MUSAN
590
+ 2025-04-24 10:22:52,216 INFO [data_module.py:268] Enable SpecAugment
591
+ 2025-04-24 10:22:52,216 INFO [data_module.py:269] Time warp factor: 80
592
+ 2025-04-24 10:22:52,216 INFO [data_module.py:279] Num frame mask: 10
593
+ 2025-04-24 10:22:52,217 INFO [data_module.py:292] About to create train dataset
594
+ 2025-04-24 10:22:52,217 INFO [data_module.py:319] Using DynamicBucketingSampler.
595
+ 2025-04-24 10:22:53,219 INFO [data_module.py:336] About to create train dataloader
596
+ 2025-04-24 10:22:53,219 INFO [data_module.py:436] About to get test cuts
597
+ 2025-04-24 10:22:53,220 INFO [data_module.py:365] About to create dev dataset
598
+ 2025-04-24 10:22:53,308 INFO [data_module.py:379] About to create dev dataloader
599
+ 2025-04-24 10:22:53,315 INFO [train.py:875] start training from epoch 1
600
+ 2025-04-24 10:23:23,440 INFO [train.py:539] Computing validation loss
601
+ 2025-04-24 10:23:31,208 INFO [train.py:548] Epoch 1, validation: loss=0.005359, acc=0.4658, codec_acc=0.0004324, codec_topk_acc=0.003407, codec_loss=0.004039, text_loss=0.00132, over 285507.00 frames.
602
+ 2025-04-24 10:23:31,209 INFO [train.py:549] Maximum memory allocated so far is 12383MB
603
+ 2025-04-24 10:23:31,687 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.005086, acc=0.4173, codec_acc=0.001085, codec_topk_ac…
+ 2025-04-24 10:23:31,687 INFO [train.py:611] Epoch 1, batch 0, loss[loss=0.004863, acc=0.4196, codec_acc=0.0002915, codec_topk_acc=0.002623, codec_loss=0.00371, text_loss=0.001153, over 2279.00 frames. ], tot_loss[loss=0.004863, acc=0.4196, codec_acc=0.000…
+ 2025-04-24 10:23:39,106 INFO [train.py:940] Saving batch to slam_omni/exp_speech2speech_rerun/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
604
+ 2025-04-24 10:23:39,117 INFO [train.py:945] features shape: torch.Size([4, 1168, 80])
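The train.py:940/945 lines above show the batch-dump debugging pattern: when a training step raises, the offending batch is saved under exp_dir so the failure can be replayed in isolation. A minimal sketch, assuming the batch is a dict holding 80-dim fbank features under an "inputs" key (helper name and key are hypothetical):

    import logging
    import uuid

    import torch

    def save_bad_batch(batch: dict, exp_dir: str) -> None:
        # Persist the batch that triggered the failure, mirroring the
        # "Saving batch to .../batch-<uuid>.pt" log line above.
        filename = f"{exp_dir}/batch-{uuid.uuid4()}.pt"
        logging.info(f"Saving batch to {filename}")
        torch.save(batch, filename)
        # (N, T, 80): 4 cuts x 1168 frames x 80 mel bins in the run above.
        logging.info(f"features shape: {batch['inputs'].shape}")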
log/log-train-2025-04-24-10-24-20 ADDED
The diff for this file is too large to render. See raw diff
 
log/log-train-2025-04-24-14-23-07 ADDED
The diff for this file is too large to render. See raw diff
 
log/log-train-2025-04-25-00-57-07 ADDED
@@ -0,0 +1,71 @@
1
+ 2025-04-25 00:57:07,641 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.183.253",
21
+ "hostname": "7518205",
22
+ "icefall-git-branch": null,
23
+ "icefall-git-date": null,
24
+ "icefall-git-sha1": null,
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 50,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
71
+ 2025-04-25 00:57:07,641 INFO [train.py:655] About to create model
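Relative to the earlier run, the config above is a restart: pretrained_model_path and sampler_state_dict_path both point at the epoch-1, step-15000 checkpoint, so model weights and the DynamicBucketingSampler position are restored instead of starting from scratch. A sketch of how such fields are typically consumed (hypothetical wiring; the actual train.py may differ):

    import torch

    def maybe_resume(model, sampler, params) -> None:
        # Warm-start model weights from the mid-epoch checkpoint.
        if params.pretrained_model_path:
            state_dict = torch.load(params.pretrained_model_path, map_location="cpu")
            model.load_state_dict(state_dict, strict=False)
        # Restore the sampler so data iteration resumes where it stopped.
        if params.sampler_state_dict_path:
            sampler.load_state_dict(torch.load(params.sampler_state_dict_path))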
log/log-train-2025-04-25-00-58-05 ADDED
@@ -0,0 +1,575 @@
1
+ 2025-04-25 00:58:05,568 INFO [train.py:653] {
2
+ "allowed_excess_duration_ratio": 0.1,
3
+ "audio_key": "question_audio",
4
+ "batch_idx_train": 0,
5
+ "best_train_epoch": -1,
6
+ "best_train_loss": Infinity,
7
+ "best_valid_epoch": -1,
8
+ "best_valid_loss": Infinity,
9
+ "bucketing_sampler": true,
10
+ "deepscale": false,
11
+ "deepscale_config": null,
12
+ "deepspeed": true,
13
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
14
+ "drop_last": true,
15
+ "enable_musan": false,
16
+ "enable_spec_aug": true,
17
+ "enable_speech_output": true,
18
+ "encoder_projector_ds_rate": 8,
19
+ "env_info": {
20
+ "IP address": "0.114.183.253",
21
+ "hostname": "7518205",
22
+ "icefall-git-branch": null,
23
+ "icefall-git-date": null,
24
+ "icefall-git-sha1": null,
25
+ "icefall-path": "/workspace/slam/icefall_omni",
26
+ "k2-build-type": "Release",
27
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
28
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
29
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
30
+ "k2-version": "1.24.4",
31
+ "k2-with-cuda": true,
32
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
33
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
34
+ "python-version": "3.11",
35
+ "torch-cuda-available": true,
36
+ "torch-cuda-version": "12.4",
37
+ "torch-version": "2.4.0"
38
+ },
39
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
40
+ "frame_shift_ms": 10,
41
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
42
+ "input_strategy": "PrecomputedFeatures",
43
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
44
+ "log_interval": 50,
45
+ "manifest_dir": "data/fbank",
46
+ "max_duration": 50,
47
+ "num_buckets": 30,
48
+ "num_epochs": 10,
49
+ "num_workers": 2,
50
+ "on_the_fly_feats": false,
51
+ "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
52
+ "resample_to_16kHz": true,
53
+ "reset_interval": 200,
54
+ "return_cuts": true,
55
+ "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
56
+ "seed": 42,
57
+ "shuffle": true,
58
+ "spec_aug_time_warp_factor": 80,
59
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
60
+ "start_epoch": 1,
61
+ "subsampling_factor": 2,
62
+ "tensorboard": true,
63
+ "text_key": "answer",
64
+ "unfreeze_llm": true,
65
+ "unfreeze_speech_projector": true,
66
+ "use_flash_attn": true,
67
+ "use_fp16": true,
68
+ "use_lora": true,
69
+ "valid_interval": 5000
70
+ }
71
+ 2025-04-25 00:58:05,569 INFO [train.py:655] About to create model
72
+ 2025-04-25 00:58:25,413 INFO [train.py:808] Number of model parameters: 1314396804
73
+ 2025-04-25 00:58:25,413 INFO [train.py:810] Trainable parameters (excluding model.eval modules):
74
+ 2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
75
+ 2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
76
+ 2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
77
+ 2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
78
+ 2025-04-25 00:58:25,414 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
79
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
80
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
81
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
82
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
83
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
84
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
85
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
86
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
87
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
88
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
89
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
90
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
91
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
92
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
93
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
94
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
95
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
96
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
97
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
98
+ 2025-04-25 00:58:25,415 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
99
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
100
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
101
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
102
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
103
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
104
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
105
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
106
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
107
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
108
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
109
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
110
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
111
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
112
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
113
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
114
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
115
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
116
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
117
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
118
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
119
+ 2025-04-25 00:58:25,416 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
120
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
121
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
122
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
123
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
124
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
125
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
126
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
127
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
128
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
129
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
130
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
131
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
132
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
133
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
134
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
135
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
136
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
137
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
138
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
139
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
140
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
141
+ 2025-04-25 00:58:25,417 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
142
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
143
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
144
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
145
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
146
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
147
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
148
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
149
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
150
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
151
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
152
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
153
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
154
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
155
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
156
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
157
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
158
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
159
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
160
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
161
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
162
+ 2025-04-25 00:58:25,418 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
163
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
164
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
165
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
166
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
167
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
168
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
169
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
170
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
171
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
172
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
173
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
174
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
175
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
176
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
177
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
178
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
179
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
180
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
181
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
182
+ 2025-04-25 00:58:25,419 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
183
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
184
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
185
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
186
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
187
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
188
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
189
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
190
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
191
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
192
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
193
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
194
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
195
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
196
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
197
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
198
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
199
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
200
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
201
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
202
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
203
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
204
+ 2025-04-25 00:58:25,420 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
205
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
206
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
207
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
208
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
209
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
210
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
211
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
212
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
213
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
214
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
215
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
216
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
217
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,421 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,422 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,423 INFO [train.py:813] llm.base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,424 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,425 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,426 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,427 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,428 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,429 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight: torch.Size([128, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight: torch.Size([64, 896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight: torch.Size([4864, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight: torch.Size([64, 4864])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] llm.base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight: torch.Size([896, 64])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear1.weight: torch.Size([896, 10240])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear1.bias: torch.Size([896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear2.weight: torch.Size([896, 896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] encoder_projector.linear2.bias: torch.Size([896])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.embed_tokens.weight: torch.Size([4100, 1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,430 INFO [train.py:813] codec_lm.model.layers.0.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.0.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.1.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.2.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,431 INFO [train.py:813] codec_lm.model.layers.3.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.3.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.4.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,432 INFO [train.py:813] codec_lm.model.layers.5.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.5.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.6.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.7.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.8.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,433 INFO [train.py:813] codec_lm.model.layers.8.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.8.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.9.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,434 INFO [train.py:813] codec_lm.model.layers.10.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.10.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.q_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.q_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.k_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.k_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.v_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.v_proj.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.self_attn.o_proj.weight: torch.Size([1024, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.mlp.gate_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.mlp.up_proj.weight: torch.Size([2048, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.mlp.down_proj.weight: torch.Size([1024, 2048])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.input_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.layers.11.post_attention_layernorm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.model.norm.weight: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm.lm_head.weight: torch.Size([4100, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] speech_token_projector.weight: torch.Size([1024, 1792])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] speech_token_projector.bias: torch.Size([1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm_head.weight: torch.Size([4100, 1024])
+ 2025-04-25 00:58:25,435 INFO [train.py:813] codec_lm_head.bias: torch.Size([4100])
+ 2025-04-25 00:58:25,435 INFO [train.py:819] Device: cuda:1
+ 2025-04-25 00:58:26,665 INFO [train.py:823] Using DeepSpeed
+ 2025-04-25 00:58:30,432 INFO [data_module.py:445] About to get train cuts
+ 2025-04-25 00:58:30,434 INFO [data_module.py:250] Disable MUSAN
+ 2025-04-25 00:58:30,434 INFO [data_module.py:268] Enable SpecAugment
+ 2025-04-25 00:58:30,434 INFO [data_module.py:269] Time warp factor: 80
+ 2025-04-25 00:58:30,434 INFO [data_module.py:279] Num frame mask: 10
+ 2025-04-25 00:58:30,434 INFO [data_module.py:292] About to create train dataset
+ 2025-04-25 00:58:30,434 INFO [data_module.py:319] Using DynamicBucketingSampler.
+ 2025-04-25 00:58:31,421 INFO [data_module.py:336] About to create train dataloader
+ 2025-04-25 00:58:31,421 INFO [data_module.py:339] Loading sampler state dict
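Note on the LoRA shapes logged above: they follow the standard PEFT convention, where lora_A is [r, in_features] and lora_B is [out_features, r], so every adapter pair here implies rank r = 64. The [128, 64] lora_B on k_proj/v_proj reflects the grouped-query attention of Qwen2.5-0.5B (2 KV heads of dimension 64), and the [4864, 64] pairs sit on the MLP projections. The sketch below reproduces these shapes; it is an illustration only, with r and target_modules inferred from the logged sizes rather than read from train.py:

# Minimal sketch (assumes peft and transformers are installed).
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
cfg = LoraConfig(
    r=64,           # inferred: every lora_A above has leading dimension 64
    lora_alpha=16,  # assumption; alpha is not recoverable from the shapes
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(base, cfg)
for name, p in model.named_parameters():
    if "lora_" in name and ".layers.10." in name:
        print(name, tuple(p.shape))
# k_proj/v_proj lora_B come out as (128, 64) because the KV projection is
# 2 heads x 64 dims; q_proj/o_proj give (896, 64), gate/up give (4864, 64).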
log/log-train-2025-04-25-01-06-08 ADDED
@@ -0,0 +1,71 @@
+ 2025-04-25 01:06:08,819 INFO [train.py:653] {
+ "allowed_excess_duration_ratio": 0.1,
+ "audio_key": "question_audio",
+ "batch_idx_train": 0,
+ "best_train_epoch": -1,
+ "best_train_loss": Infinity,
+ "best_valid_epoch": -1,
+ "best_valid_loss": Infinity,
+ "bucketing_sampler": true,
+ "deepscale": false,
+ "deepscale_config": null,
+ "deepspeed": true,
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
+ "drop_last": true,
+ "enable_musan": false,
+ "enable_spec_aug": true,
+ "enable_speech_output": true,
+ "encoder_projector_ds_rate": 8,
+ "env_info": {
+ "IP address": "0.114.183.253",
+ "hostname": "7518205",
+ "icefall-git-branch": null,
+ "icefall-git-date": null,
+ "icefall-git-sha1": null,
+ "icefall-path": "/workspace/slam/icefall_omni",
+ "k2-build-type": "Release",
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+ "k2-version": "1.24.4",
+ "k2-with-cuda": true,
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
+ "python-version": "3.11",
+ "torch-cuda-available": true,
+ "torch-cuda-version": "12.4",
+ "torch-version": "2.4.0"
+ },
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
+ "frame_shift_ms": 10,
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+ "input_strategy": "PrecomputedFeatures",
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+ "log_interval": 50,
+ "manifest_dir": "data/fbank",
+ "max_duration": 50,
+ "num_buckets": 30,
+ "num_epochs": 10,
+ "num_workers": 2,
+ "on_the_fly_feats": false,
+ "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
+ "resample_to_16kHz": true,
+ "reset_interval": 200,
+ "return_cuts": true,
+ "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
+ "seed": 42,
+ "shuffle": true,
+ "spec_aug_time_warp_factor": 80,
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+ "start_epoch": 1,
+ "subsampling_factor": 2,
+ "tensorboard": true,
+ "text_key": "answer",
+ "unfreeze_llm": true,
+ "unfreeze_speech_projector": true,
+ "use_flash_attn": true,
+ "use_fp16": true,
+ "use_lora": true,
+ "valid_interval": 5000
+ }
+ 2025-04-25 01:06:08,819 INFO [train.py:655] About to create model
log/log-train-2025-04-25-01-06-09 ADDED
@@ -0,0 +1,71 @@
+ 2025-04-25 01:06:09,289 INFO [train.py:653] {
+ "allowed_excess_duration_ratio": 0.1,
+ "audio_key": "question_audio",
+ "batch_idx_train": 0,
+ "best_train_epoch": -1,
+ "best_train_loss": Infinity,
+ "best_valid_epoch": -1,
+ "best_valid_loss": Infinity,
+ "bucketing_sampler": true,
+ "deepscale": false,
+ "deepscale_config": null,
+ "deepspeed": true,
+ "deepspeed_config": "./slam_omni/ds_config_zero1.json",
+ "drop_last": true,
+ "enable_musan": false,
+ "enable_spec_aug": true,
+ "enable_speech_output": true,
+ "encoder_projector_ds_rate": 8,
+ "env_info": {
+ "IP address": "0.114.183.253",
+ "hostname": "7518205",
+ "icefall-git-branch": null,
+ "icefall-git-date": null,
+ "icefall-git-sha1": null,
+ "icefall-path": "/workspace/slam/icefall_omni",
+ "k2-build-type": "Release",
+ "k2-git-date": "Tue Oct 29 09:02:19 2024",
+ "k2-git-sha1": "75e2ed6b2fd87c22b7f3f34bad48a69984bb8755",
+ "k2-path": "/opt/conda/lib/python3.11/site-packages/k2/__init__.py",
+ "k2-version": "1.24.4",
+ "k2-with-cuda": true,
+ "lhotse-path": "/workspace/slam/lhotse/lhotse/__init__.py",
+ "lhotse-version": "1.30.0.dev+git.13c7616f.dirty",
+ "python-version": "3.11",
+ "torch-cuda-available": true,
+ "torch-cuda-version": "12.4",
+ "torch-version": "2.4.0"
+ },
+ "exp_dir": "slam_omni/exp_speech2speech_rerun",
+ "frame_shift_ms": 10,
+ "huggingface_dataset_path_or_name": "/workspace/Belle_1.4M-SLAM-Omni",
+ "input_strategy": "PrecomputedFeatures",
+ "llm_path_or_name": "models/Qwen2.5-0.5B-Instruct",
+ "log_interval": 50,
+ "manifest_dir": "data/fbank",
+ "max_duration": 50,
+ "num_buckets": 30,
+ "num_epochs": 10,
+ "num_workers": 2,
+ "on_the_fly_feats": false,
+ "pretrained_model_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000.pt/pytorch_model.bin",
+ "resample_to_16kHz": true,
+ "reset_interval": 200,
+ "return_cuts": true,
+ "sampler_state_dict_path": "./slam_omni/exp_speech2speech_rerun/epoch-1-checkpoint-15000-sampler.pt",
+ "seed": 42,
+ "shuffle": true,
+ "spec_aug_time_warp_factor": 80,
+ "speech_encoder_path_or_name": "models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt",
+ "start_epoch": 1,
+ "subsampling_factor": 2,
+ "tensorboard": true,
+ "text_key": "answer",
+ "unfreeze_llm": true,
+ "unfreeze_speech_projector": true,
+ "use_flash_attn": true,
+ "use_fp16": true,
+ "use_lora": true,
+ "valid_interval": 5000
+ }
+ 2025-04-25 01:06:09,289 INFO [train.py:655] About to create model
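Both restarts above log the same hyperparameter blob before model creation. One practical detail for anyone post-processing these logs: the dump uses bare Infinity, which strict JSON forbids, but Python's json module accepts it by default and maps it to float('inf'), so the blob can be re-parsed as-is. A small self-contained sketch (the blob below is an excerpt of the dump, not the full dict):

import json

blob = """{
  "best_train_loss": Infinity,
  "max_duration": 50,
  "use_lora": true
}"""
params = json.loads(blob)            # bare Infinity parses fine
assert params["best_train_loss"] == float("inf")
print(params["max_duration"], params["use_lora"])  # -> 50 True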
log/log-train-2025-04-25-01-07-00 ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e518bb90d19eff29cab0e36e024ea6c8c51eb0ebf01bbb92026646d414da9c2
+ size 749750076
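pytorch_model.bin is committed as a Git LFS pointer: the oid is the SHA-256 of the real file and size is its length in bytes (about 750 MB here). After downloading the resolved file, the pointer can be checked locally; a minimal sketch, with the local path as an assumption:

import hashlib, os

path = "pytorch_model.bin"  # wherever the resolved LFS file was downloaded
expected_oid = "2e518bb90d19eff29cab0e36e024ea6c8c51eb0ebf01bbb92026646d414da9c2"
expected_size = 749750076

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert os.path.getsize(path) == expected_size
assert h.hexdigest() == expected_oid
print("LFS pointer verified")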
tensorboard/events.out.tfevents.1745485127.7515258.1454637.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3420d0018b82da7a2f43ca561c9fdfa179db66e06bb2d3ac614ce2e15aef91d3
+ size 135
tensorboard/events.out.tfevents.1745485786.7515258.1472312.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:574e628eb7cf4b844bcce9409abb79402c7e6572268922ca4e0be5303917364a
+ size 135
tensorboard/events.out.tfevents.1745486182.7515258.1482888.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4258223345f90a5e9b4adff784bafa145a6eae5082a911e0f78c0af8f1e8353
+ size 135
tensorboard/events.out.tfevents.1745489567.7515258.1571829.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82506830e9b646b9f24e159b391bf95b5d35b7bc747f0ce5a17e357f8e7ecb2a
+ size 135
tensorboard/events.out.tfevents.1745489713.7515258.1575936.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6e078e3c3f3220e183555839741d1b351c12f607c8f4faded3aa6ef3320f8a0
+ size 135
tensorboard/events.out.tfevents.1745489870.7515258.1580280.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b604e4364358ea7691f076b13759f31ef10f29a085d85bd1b0851b0eedf3bf9a
+ size 2749
tensorboard/events.out.tfevents.1745490173.7518205.24828.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65fdf02f6e5f61a8c53283b6bab3e98481ba09c0df80c0b426ebf5098c34a1a4
+ size 1239
tensorboard/events.out.tfevents.1745490285.7518205.28691.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38670ceb5697c143c012d8a151c2d7c2a3df929d4078f9d78dea7e75db336bf0
+ size 243430
tensorboard/events.out.tfevents.1745505052.7518205.257175.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61e88e23f8a8a50b7e58df2cabdfab400daedb6b180b2d3a6c4d3708234b9c96
+ size 68029
tensorboard/events.out.tfevents.1745543674.7518205.398874.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0293cafc5c6bfb857b9c784b441ede3d8066a65e7c2187c8776a7065d906538f
+ size 3480854
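The tensorboard/ event files are LFS pointers as well; the 135-byte ones presumably come from runs that exited before logging any scalars, while the larger files (up to ~3.5 MB for the final rerun) hold the actual curves. Once downloaded, they can be inspected with TensorBoard's event reader; a minimal sketch assuming the tensorboard package is available:

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("tensorboard")  # directory containing the tfevents files
acc.Reload()
for tag in acc.Tags()["scalars"]:
    events = acc.Scalars(tag)          # list of (wall_time, step, value) events
    print(tag, "points:", len(events),
          "last:", events[-1].step, events[-1].value)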