diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..3d611a9150262ba4b13e2f6662fdb50f1df1e1a1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +data/matcha_tts-0.0.5.1-cp38-cp38-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text +data/wavs/BAC009S0915W0292.wav filter=lfs diff=lfs merge=lfs -text +out.wav filter=lfs diff=lfs merge=lfs -text +talker/tokenizer.json filter=lfs diff=lfs merge=lfs -text + + + diff --git a/__pycache__/bailingmm_utils.cpython-38.pyc b/__pycache__/bailingmm_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f8d658f2cd1afa2ad37840dcfde03c26c26987c Binary files /dev/null and b/__pycache__/bailingmm_utils.cpython-38.pyc differ diff --git a/__pycache__/chat_format.cpython-38.pyc b/__pycache__/chat_format.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcd6d3d676eb99feabf9ca3e2cd7f35be001b7e8 Binary files /dev/null and b/__pycache__/chat_format.cpython-38.pyc differ diff --git a/__pycache__/configuration_audio.cpython-38.pyc b/__pycache__/configuration_audio.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88afcb16f730ee640f4909ef5440d94a6652b45d Binary files /dev/null and b/__pycache__/configuration_audio.cpython-38.pyc differ diff --git a/__pycache__/configuration_bailing_moe.cpython-38.pyc b/__pycache__/configuration_bailing_moe.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63899ca6560b4093848246ec57434924bb64ef6e Binary files /dev/null and b/__pycache__/configuration_bailing_moe.cpython-38.pyc differ diff --git a/__pycache__/configuration_bailing_talker.cpython-38.pyc b/__pycache__/configuration_bailing_talker.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9379a57892522e294093fa5a99e7189d572efd64 Binary files /dev/null and b/__pycache__/configuration_bailing_talker.cpython-38.pyc differ diff --git a/__pycache__/configuration_bailingmm.cpython-38.pyc b/__pycache__/configuration_bailingmm.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bb53dbdbf575ce8a608be6f4edd90cdc2aa054c Binary files /dev/null and b/__pycache__/configuration_bailingmm.cpython-38.pyc differ diff --git a/__pycache__/image_processing_bailingmm.cpython-38.pyc b/__pycache__/image_processing_bailingmm.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcca40143ee9a575e059293c3327075dbc75d90e Binary files /dev/null and b/__pycache__/image_processing_bailingmm.cpython-38.pyc differ diff --git a/__pycache__/modeling_bailing_moe.cpython-38.pyc b/__pycache__/modeling_bailing_moe.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53579dee6c5956a63a9216dd4880e55197c8042d Binary files /dev/null and b/__pycache__/modeling_bailing_moe.cpython-38.pyc differ diff --git a/__pycache__/modeling_bailing_talker.cpython-38.pyc b/__pycache__/modeling_bailing_talker.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95de3b28c0f003873a5c55f78091eadb02b51a07 Binary files /dev/null and b/__pycache__/modeling_bailing_talker.cpython-38.pyc differ diff --git a/__pycache__/modeling_bailingmm.cpython-38.pyc b/__pycache__/modeling_bailingmm.cpython-38.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..4b9dc01e0785602b6073f7e5fdec3d4a5d54cb00 Binary files /dev/null and b/__pycache__/modeling_bailingmm.cpython-38.pyc differ diff --git a/__pycache__/modeling_utils.cpython-38.pyc b/__pycache__/modeling_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cdd3e51825162d0328af58fa273fa6551084756f Binary files /dev/null and b/__pycache__/modeling_utils.cpython-38.pyc differ diff --git a/__pycache__/qwen2_5_vit.cpython-38.pyc b/__pycache__/qwen2_5_vit.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c600f4c32f08af52109bcc9bbe3175d10cef769d Binary files /dev/null and b/__pycache__/qwen2_5_vit.cpython-38.pyc differ diff --git a/__pycache__/s3bpe_tokenizer.cpython-38.pyc b/__pycache__/s3bpe_tokenizer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..948bdb90ce3148d6f363a35c0fad3d8f14549628 Binary files /dev/null and b/__pycache__/s3bpe_tokenizer.cpython-38.pyc differ diff --git a/am.mvn b/am.mvn new file mode 100644 index 0000000000000000000000000000000000000000..681910cd1ab6458b61474cdbf0d1ac5e810f7b0d --- /dev/null +++ b/am.mvn @@ -0,0 +1,8 @@ + + 560 560 +[ 0 ] + 560 560 + 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 
-14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ] + 560 560 + 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 
0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 
0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ] + diff --git a/audio_detokenizer/__init__.py b/audio_detokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detokenizer/__pycache__/__init__.cpython-38.pyc b/audio_detokenizer/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d35e70f2ee21fc787ff17adf1191c6cf5b6c36dc Binary files /dev/null and b/audio_detokenizer/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detokenizer/cli/__init__.py b/audio_detokenizer/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detokenizer/cli/__pycache__/__init__.cpython-38.pyc b/audio_detokenizer/cli/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ad26fb9eda807ac3f47efeb06b15b61ae180827 Binary files /dev/null and b/audio_detokenizer/cli/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detokenizer/cli/__pycache__/model.cpython-38.pyc b/audio_detokenizer/cli/__pycache__/model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8535825f7740986319edbf6b566e4d846d3e42e0 Binary files /dev/null and b/audio_detokenizer/cli/__pycache__/model.cpython-38.pyc differ diff --git a/audio_detokenizer/cli/model.py b/audio_detokenizer/cli/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8ba51ed58258d45c455c210b803a6e949b8e333c --- /dev/null +++ b/audio_detokenizer/cli/model.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import torch +import time + +class AudioDetokenizerModel: + + def __init__(self, + flow: torch.nn.Module, + hift: torch.nn.Module, + lora_config=None): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.flow = flow + self.hift = hift + self.dtype = torch.float16 + # self.dtype = torch.bfloat16 + self.max_seq_short = 384 + self.max_seq_long = 2048 + self.max_batch = 1 + + def load(self, flow_model, hift_model): + self.flow.load_state_dict(torch.load(flow_model, map_location=self.device)) + self.flow.to(self.device).eval().to(self.dtype) + self.hift.load_state_dict(torch.load(hift_model, map_location=self.device)) + self.hift.to(self.device).eval() + + def inference(self, flow_embedding, tts_speech_token, + flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32), + prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32), is_en=False): + + torch.cuda.synchronize() + t0 = time.time() + + torch.cuda.synchronize() + t1 = time.time() + + tts_mel = self.flow.inference(token=tts_speech_token.to(self.device), + token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device), + prompt_token=flow_prompt_speech_token.to(self.device), + prompt_token_len=flow_prompt_speech_token_len.to(self.device), + prompt_feat=prompt_speech_feat.to(self.device), + prompt_feat_len=prompt_speech_feat_len.to(self.device), + embedding=flow_embedding.to(self.device).to(self.dtype)).float() + torch.cuda.synchronize() + + tts_speech = self.hift.inference(mel=tts_mel).cpu() + torch.cuda.synchronize() + dur = tts_speech.shape[-1]/22050 + torch.cuda.empty_cache() + return {'tts_speech': tts_speech} diff --git a/audio_detokenizer/flow/__init__.py b/audio_detokenizer/flow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detokenizer/flow/__pycache__/__init__.cpython-38.pyc b/audio_detokenizer/flow/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02e3a7b5e051db98ffebe4b4e4352de9172f906b Binary files /dev/null and b/audio_detokenizer/flow/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detokenizer/flow/__pycache__/decoder.cpython-38.pyc b/audio_detokenizer/flow/__pycache__/decoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..091fe47b4b26221bd8f3c53460d3a8b5298524c9 Binary files /dev/null and b/audio_detokenizer/flow/__pycache__/decoder.cpython-38.pyc differ diff --git a/audio_detokenizer/flow/__pycache__/flow.cpython-38.pyc b/audio_detokenizer/flow/__pycache__/flow.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..560ee0ec2362932055547c95fa7baf4893a44bec Binary files /dev/null and b/audio_detokenizer/flow/__pycache__/flow.cpython-38.pyc differ diff --git a/audio_detokenizer/flow/__pycache__/flow_matching.cpython-38.pyc b/audio_detokenizer/flow/__pycache__/flow_matching.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..1c0c23cd382f42d56cae3080ce9e32732ade3505 Binary files /dev/null and b/audio_detokenizer/flow/__pycache__/flow_matching.cpython-38.pyc differ diff --git a/audio_detokenizer/flow/__pycache__/length_regulator.cpython-38.pyc b/audio_detokenizer/flow/__pycache__/length_regulator.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39beb1fc7734d9e6da36bc0de1611edb2bf4ad02 Binary files /dev/null and b/audio_detokenizer/flow/__pycache__/length_regulator.cpython-38.pyc differ diff --git a/audio_detokenizer/flow/decoder.py b/audio_detokenizer/flow/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..105bc9068ef643ea66e7471028060231d6160016 --- /dev/null +++ b/audio_detokenizer/flow/decoder.py @@ -0,0 +1,224 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# antflake8: noqa +import torch +import torch.nn as nn +from einops import pack, rearrange, repeat +from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D +from matcha.models.components.transformer import BasicTransformerBlock + + +class ConditionalDecoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + channels=(256, 256), + dropout=0.05, + attention_head_dim=64, + n_blocks=1, + num_mid_blocks=2, + num_heads=4, + act_fn="snake", + ): + """ + This decoder requires an input with the same shape as the target. So, if your text content + is shorter or longer than the outputs, please resample it before feeding it to the decoder.
+ """ + super().__init__() + channels = tuple(channels) + self.in_channels = in_channels + self.out_channels = out_channels + + self.time_embeddings = SinusoidalPosEmb(in_channels) + time_embed_dim = channels[0] * 4 + self.time_mlp = TimestepEmbedding( + in_channels=in_channels, + time_embed_dim=time_embed_dim, + act_fn="silu", + ) + self.down_blocks = nn.ModuleList([]) + self.mid_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + self.compiled_infer = None + + output_channel = in_channels + for i in range(len(channels)): # pylint: disable=consider-using-enumerate + input_channel = output_channel + output_channel = channels[i] + is_last = i == len(channels) - 1 + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + downsample = ( + Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) + ) + self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) + + for i in range(num_mid_blocks): + input_channel = channels[-1] + out_channels = channels[-1] + resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) + + transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + + self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) + + channels = channels[::-1] + (channels[0],) + for i in range(len(channels) - 1): + input_channel = channels[i] * 2 + output_channel = channels[i + 1] + is_last = i == len(channels) - 2 + resnet = ResnetBlock1D( + dim=input_channel, + dim_out=output_channel, + time_emb_dim=time_embed_dim, + ) + transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + dim=output_channel, + num_attention_heads=num_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + activation_fn=act_fn, + ) + for _ in range(n_blocks) + ] + ) + upsample = ( + Upsample1D(output_channel, use_conv_transpose=True) + if not is_last + else nn.Conv1d(output_channel, output_channel, 3, padding=1) + ) + self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) + self.final_block = Block1D(channels[-1], channels[-1]) + self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) + self.initialize_weights() + + + def initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.GroupNorm): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.kaiming_normal_(m.weight, nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x, mask, mu, t, spks=None, cond=None): + """Forward pass of the UNet1DConditional model. + + Args: + x (torch.Tensor): shape (batch_size, in_channels, time) + mask (_type_): shape (batch_size, 1, time) + t (_type_): shape (batch_size) + spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. + cond (_type_, optional): placeholder for future use. Defaults to None. 
+ + Raises: + ValueError: _description_ + ValueError: _description_ + + Returns: + _type_: _description_ + """ + + t = self.time_embeddings(t).to(t.dtype) + t = self.time_mlp(t) + + x = pack([x, mu], "b * t")[0] + + if spks is not None: + spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) + x = pack([x, spks], "b * t")[0] + if cond is not None: + x = pack([x, cond], "b * t")[0] + + hiddens = [] + masks = [mask] + for resnet, transformer_blocks, downsample in self.down_blocks: + mask_down = masks[-1] + x = resnet(x, mask_down, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + hiddens.append(x) # Save hidden states for skip connections + x = downsample(x * mask_down) + masks.append(mask_down[:, :, ::2]) + masks = masks[:-1] + mask_mid = masks[-1] + + for resnet, transformer_blocks in self.mid_blocks: + x = resnet(x, mask_mid, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + + for resnet, transformer_blocks, upsample in self.up_blocks: + mask_up = masks.pop() + skip = hiddens.pop() + x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] + x = resnet(x, mask_up, t) + x = rearrange(x, "b c t -> b t c").contiguous() + attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up) + for transformer_block in transformer_blocks: + x = transformer_block( + hidden_states=x, + attention_mask=attn_mask, + timestep=t, + ) + x = rearrange(x, "b t c -> b c t").contiguous() + x = upsample(x * mask_up) + x = self.final_block(x, mask_up) + output = self.final_proj(x * mask_up) + return output * mask diff --git a/audio_detokenizer/flow/flow.py b/audio_detokenizer/flow/flow.py new file mode 100644 index 0000000000000000000000000000000000000000..a40d8d6211a713eb7c5e868770707966f1c95454 --- /dev/null +++ b/audio_detokenizer/flow/flow.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import random +from typing import Dict, Optional +import torch +import torch.nn as nn +from torch.nn import functional as F +from omegaconf import DictConfig +from ..utils.mask import make_pad_mask + + +class MaskedDiffWithXvec(torch.nn.Module): + def __init__(self, + input_size: int = 512, + output_size: int = 80, + spk_embed_dim: int = 192, + output_type: str = "mel", + vocab_size: int = 4096, + input_frame_rate: int = 50, + only_mask_loss: bool = True, + encoder: torch.nn.Module = None, + length_regulator: torch.nn.Module = None, + decoder: torch.nn.Module = None, + decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}, + mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.decoder_conf = decoder_conf + self.mel_feat_conf = mel_feat_conf + self.vocab_size = vocab_size + self.output_type = output_type + self.input_frame_rate = input_frame_rate + logging.info(f"input frame rate={self.input_frame_rate}") + self.input_embedding = nn.Embedding(vocab_size, input_size) + self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size) + self.encoder = encoder + self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size) + self.decoder = decoder + self.length_regulator = length_regulator + self.only_mask_loss = only_mask_loss + self.max_seq_long = 2048 * 2 + self.max_seq_short = 256 * 2 + + def forward( + self, + batch: dict, + device: torch.device, + ) -> Dict[str, Optional[torch.Tensor]]: + token = batch['speech_token'].to(device) + token_len = batch['speech_token_len'].to(device) + feat = batch['speech_feat'].to(device) + feat_len = batch['speech_feat_len'].to(device) + embedding = batch['embedding'].to(device) + + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + + # concat text and prompt_text + mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + h, h_lengths = self.length_regulator(h, feat_len) + + # get conditions + conds = torch.zeros(feat.shape, device=token.device) + for i, j in enumerate(feat_len): + if random.random() < 0.5: + continue + index = random.randint(0, int(0.3 * j)) + conds[i, :index] = feat[i, :index] + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(feat_len)).to(h) + feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1) + loss, _ = self.decoder.compute_loss( + feat.transpose(1, 2).contiguous(), + mask.unsqueeze(1), + h.transpose(1, 2).contiguous(), + embedding, + cond=conds + ) + return {'loss': loss} + + @torch.inference_mode() + def inference(self, + token, + token_len, + prompt_token, + prompt_token_len, + prompt_feat, + prompt_feat_len, + embedding): + assert token.shape[0] == 1 + # xvec projection + embedding = F.normalize(embedding, dim=1) + embedding = self.spk_embed_affine_layer(embedding) + + # 
concat text and prompt_text + token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len + mask = (~make_pad_mask(token_len)).to(embedding.dtype).unsqueeze(-1).to(embedding) + token = self.input_embedding(torch.clamp(token, min=0)) * mask + + # text encode + h, h_lengths = self.encoder(token, token_len) + h = self.encoder_proj(h) + feat_len = (token_len / self.input_frame_rate * 22050 / 256).int() + h, h_lengths = self.length_regulator(h, feat_len) + + fix_max_len = feat_len.max().item() + + # get conditions + conds = torch.zeros([1, fix_max_len, self.output_size], device=token.device, dtype=embedding.dtype) + # if prompt_feat.shape[1] != 0: + # for i, j in enumerate(prompt_feat_len): + # conds[i, :j] = prompt_feat[i] + conds = conds.transpose(1, 2) + + mask = (~make_pad_mask(feat_len, fix_max_len)).to(h) + + feat = self.decoder.forward( + mu=h.transpose(1, 2).contiguous(), + mask=mask.unsqueeze(1), + spks=embedding, + cond=conds, + n_timesteps=8, + # temperature=0.7, + ) + + if prompt_feat.shape[1] != 0: + feat = feat[:, :, prompt_feat.shape[1]:] + return feat diff --git a/audio_detokenizer/flow/flow_matching.py b/audio_detokenizer/flow/flow_matching.py new file mode 100644 index 0000000000000000000000000000000000000000..35eed592e65d908d27b47798c921349b9583d16d --- /dev/null +++ b/audio_detokenizer/flow/flow_matching.py @@ -0,0 +1,242 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# antflake8: noqa +import os +import torch + +try: + import tensorrt as trt +except ImportError: + import warnings + warnings.warn("Failed to import TensorRT. 
Make sure TensorRT is installed and available in your environment.", ImportWarning) + +import torch.nn.functional as F +from matcha.models.components.flow_matching import BASECFM + +class ConditionalCFM(BASECFM): + def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, tensorrt_model_path="estimator_fp16.plan", estimator: torch.nn.Module = None): + super().__init__( + n_feats=in_channels, + cfm_params=cfm_params, + n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + ) + self.t_scheduler = cfm_params.t_scheduler + self.training_cfg_rate = cfm_params.training_cfg_rate + self.inference_cfg_rate = cfm_params.inference_cfg_rate + in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0) + # Just change the architecture of the estimator here + self.estimator = estimator + self.compiled_estimator = None + + self.export_onnx = False + self.use_tensorrt = False + + if os.path.isfile(tensorrt_model_path): + trt.init_libnvinfer_plugins(None, "") + logger = trt.Logger(trt.Logger.WARNING) + runtime = trt.Runtime(logger) + with open(tensorrt_model_path, 'rb') as f: + serialized_engine = f.read() + self.engine = runtime.deserialize_cuda_engine(serialized_engine) + self._context = self.engine.create_execution_context() + self.use_tensorrt = True + + @torch.inference_mode() + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + """Forward diffusion + + Args: + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + n_timesteps (int): number of diffusion steps + temperature (float, optional): temperature for scaling noise. Defaults to 1.0. + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + + Returns: + sample: generated mel-spectrogram + shape: (batch_size, n_feats, mel_timesteps) + """ + z = torch.randn_like(mu) * temperature + t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype) + if self.t_scheduler == 'cosine': + t_span = 1 - torch.cos(t_span * 0.5 * torch.pi) + return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond) + + def estimator_infer(self, x, mask, mu, t, spks, cond): + if self.use_tensorrt: + # print("Using tensorrt now !!!!") + bs = x.shape[0] + hs = x.shape[1] + seq_len = x.shape[2] + + assert bs == 1 and hs == 80 + + ret = torch.empty_like(x) + self._context.set_input_shape("x", x.shape) + self._context.set_input_shape("mask", mask.shape) + self._context.set_input_shape("mu", mu.shape) + self._context.set_input_shape("t", t.shape) + self._context.set_input_shape("spks", spks.shape) + self._context.set_input_shape("cond", cond.shape) + + bindings = [x.data_ptr(), mask.data_ptr(), mu.data_ptr(), t.data_ptr(), spks.data_ptr(), cond.data_ptr(), ret.data_ptr()] + + for i in range(len(bindings)): + self._context.set_tensor_address(self.engine.get_tensor_name(i), bindings[i]) + + handle = torch.cuda.current_stream().cuda_stream + self._context.execute_async_v3(stream_handle=handle) + return ret + else: + return self.estimator.forward(x, mask, mu, t, spks, cond) + + def solve_euler(self, x, t_span, mu, mask, spks, cond): + """ + Fixed euler solver for ODEs. 
+ Args: + x (torch.Tensor): random noise + t_span (torch.Tensor): n_timesteps interpolated + shape: (n_timesteps + 1,) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): output_mask + shape: (batch_size, 1, mel_timesteps) + spks (torch.Tensor, optional): speaker ids. Defaults to None. + shape: (batch_size, spk_emb_dim) + cond: Not used but kept for future purposes + """ + t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] + t = t.unsqueeze(dim=0) + + # I am storing this because I can later plot it by putting a debugger here and saving it to a file + # Or in future might add like a return_all_steps flag + sol = [] + + # self.export_onnx= True + # if self.export_onnx == True: + # dummy_input = (x, mask, mu, t, spks, cond) + # torch.onnx.export( + # self.estimator, + # dummy_input, + # "estimator_bf16.onnx", + # export_params=True, + # opset_version=18, + # do_constant_folding=True, + # input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'], + # output_names=['output'], + # dynamic_axes={ + # 'x': {2: 'seq_len'}, + # 'mask': {2: 'seq_len'}, + # 'mu': {2: 'seq_len'}, + # 'cond': {2: 'seq_len'}, + # 'output': {2: 'seq_len'}, + # } + # ) + # onnx_file_path = "estimator_bf16.onnx" + # tensorrt_path = "/root/TensorRT-10.2.0.19" + # if not tensorrt_path: + # raise EnvironmentError("Please set the 'tensorrt_root_dir' environment variable.") + + # if not os.path.isdir(tensorrt_path): + # raise FileNotFoundError(f"The directory {tensorrt_path} does not exist.") + + # trt_lib_path = os.path.join(tensorrt_path, "lib") + # if trt_lib_path not in os.environ.get('LD_LIBRARY_PATH', ''): + # print(f"Adding TensorRT lib path {trt_lib_path} to LD_LIBRARY_PATH.") + # os.environ['LD_LIBRARY_PATH'] = f"{os.environ.get('LD_LIBRARY_PATH', '')}:{trt_lib_path}" + + # trt_file_name = 'estimator_bf16.plan' + # flow_model_dir ='.' 
+ # # trt_file_path = os.path.join(flow_model_dir, trt_file_name) + + # trtexec_bin = os.path.join(tensorrt_path, 'bin/trtexec') + # trtexec_cmd = f"{trtexec_bin} --onnx={onnx_file_path} --saveEngine={trt_file_name} " \ + # "--minShapes=x:1x80x1,mask:1x1x1,mu:1x80x1,t:1,spks:1x80,cond:1x80x1 " \ + # "--maxShapes=x:1x80x4096,mask:1x1x4096,mu:1x80x4096,t:1,spks:1x80,cond:1x80x4096 " + \ + # "--fp16" + + # print("execute tensorrt", trtexec_cmd) + # os.system(trtexec_cmd) + # # """ + # # ${TensorRT-10.2.0.19}/bin/trtexec --onnx=estimator_fp16.onnx --saveEngine=estimator_fp16.plan \ + # # --minShapes=x:1x80x1,mask:1x1x1,mu:1x80x1,t:1,spks:1x80,cond:1x80x1 \ + # # --maxShapes=x:1x80x4096,mask:1x1x4096,mu:1x80x4096,t:1,spks:1x80,cond:1x80x4096 \ + # # --fp16 --verbose + # # """ + + + for step in range(1, len(t_span)): + dphi_dt = self.estimator_infer(x, mask, mu, t, spks, cond).clone() + # Classifier-Free Guidance inference introduced in VoiceBox + if self.inference_cfg_rate > 0: + cfg_dphi_dt = self.estimator_infer( + x, mask, + torch.zeros_like(mu), t, + torch.zeros_like(spks) if spks is not None else None, + torch.zeros_like(cond) + ).clone() + dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt) + x = x + dt * dphi_dt + t = t + dt + sol.append(x) + if step < len(t_span) - 1: + dt = t_span[step + 1] - t + + return sol[-1] + + def compute_loss(self, x1, mask, mu, spks=None, cond=None): + """Computes diffusion loss + + Args: + x1 (torch.Tensor): Target + shape: (batch_size, n_feats, mel_timesteps) + mask (torch.Tensor): target mask + shape: (batch_size, 1, mel_timesteps) + mu (torch.Tensor): output of encoder + shape: (batch_size, n_feats, mel_timesteps) + spks (torch.Tensor, optional): speaker embedding. Defaults to None. + shape: (batch_size, spk_emb_dim) + + Returns: + loss: conditional flow matching loss + y: conditional flow + shape: (batch_size, n_feats, mel_timesteps) + """ + b, _, t = mu.shape + + # random timestep + t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) + if self.t_scheduler == 'cosine': + t = 1 - torch.cos(t * 0.5 * torch.pi) + # sample noise p(x_0) + z = torch.randn_like(x1) + + y = (1 - (1 - self.sigma_min) * t) * z + t * x1 + u = x1 - (1 - self.sigma_min) * z + + # during training, we randomly drop condition to trade off mode coverage and sample fidelity + if self.training_cfg_rate > 0: + cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate + mu = mu * cfg_mask.view(-1, 1, 1) + spks = spks * cfg_mask.view(-1, 1) + cond = cond * cfg_mask.view(-1, 1, 1) + + pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond) + loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1]) + return loss, y diff --git a/audio_detokenizer/flow/length_regulator.py b/audio_detokenizer/flow/length_regulator.py new file mode 100644 index 0000000000000000000000000000000000000000..f64f8c6bf628240bb710b3a8f96db22ba220a02b --- /dev/null +++ b/audio_detokenizer/flow/length_regulator.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Tuple +import torch.nn as nn +from torch.nn import functional as F +from ..utils.mask import make_pad_mask + + +class InterpolateRegulator(nn.Module): + def __init__( + self, + channels: int, + sampling_ratios: Tuple, + out_channels: int = None, + groups: int = 1, + ): + super().__init__() + self.sampling_ratios = sampling_ratios + out_channels = out_channels or channels + model = nn.ModuleList([]) + if len(sampling_ratios) > 0: + for _ in sampling_ratios: + module = nn.Conv1d(channels, channels, 3, 1, 1) + norm = nn.GroupNorm(groups, channels) + act = nn.Mish() + model.extend([module, norm, act]) + model.append( + nn.Conv1d(channels, out_channels, 1, 1) + ) + self.model = nn.Sequential(*model) + + def forward(self, x, ylens=None): + # x in (B, T, D) + mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1) + x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest') + out = self.model(x).transpose(1, 2).contiguous() + olens = ylens + return out * mask, olens diff --git a/audio_detokenizer/hifigan/__init__.py b/audio_detokenizer/hifigan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detokenizer/hifigan/__pycache__/__init__.cpython-38.pyc b/audio_detokenizer/hifigan/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98969bee81f02b258fcb751374f804eb4b2bcb96 Binary files /dev/null and b/audio_detokenizer/hifigan/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detokenizer/hifigan/__pycache__/f0_predictor.cpython-38.pyc b/audio_detokenizer/hifigan/__pycache__/f0_predictor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83a6cd150d3d9b6ed7c57f83b67e3f9d4293e1b0 Binary files /dev/null and b/audio_detokenizer/hifigan/__pycache__/f0_predictor.cpython-38.pyc differ diff --git a/audio_detokenizer/hifigan/__pycache__/generator.cpython-38.pyc b/audio_detokenizer/hifigan/__pycache__/generator.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97f5fa92709ccf747dad64ec23cc5aee182e344e Binary files /dev/null and b/audio_detokenizer/hifigan/__pycache__/generator.cpython-38.pyc differ diff --git a/audio_detokenizer/hifigan/f0_predictor.py b/audio_detokenizer/hifigan/f0_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..36b85f4ed90c3a412cb179f49ccb471132a86550 --- /dev/null +++ b/audio_detokenizer/hifigan/f0_predictor.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +from torch.nn.utils import weight_norm + + +class ConvRNNF0Predictor(nn.Module): + def __init__(self, + num_class: int = 1, + in_channels: int = 80, + cond_channels: int = 512 + ): + super().__init__() + + self.num_class = num_class + self.condnet = nn.Sequential( + weight_norm( + nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + weight_norm( + nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1) + ), + nn.ELU(), + ) + self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.condnet(x) + x = x.transpose(1, 2) + return torch.abs(self.classifier(x).squeeze(-1)) diff --git a/audio_detokenizer/hifigan/generator.py b/audio_detokenizer/hifigan/generator.py new file mode 100644 index 0000000000000000000000000000000000000000..efc4be1868ef2a0219c836ac40cab20c40547b1e --- /dev/null +++ b/audio_detokenizer/hifigan/generator.py @@ -0,0 +1,392 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""HIFI-GAN""" +# antflake8: noqa + +import typing as tp +import numpy as np +from scipy.signal import get_window +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Conv1d +from torch.nn import ConvTranspose1d +from torch.nn.utils import remove_weight_norm +from torch.nn.utils import weight_norm +from torch.distributions.uniform import Uniform + +from ..transformer.activation import Snake +from ..utils.common import get_padding +from ..utils.common import init_weights + + +"""hifigan based generator implementation. 
+ +This code is modified from https://github.com/jik876/hifi-gan + ,https://github.com/kan-bayashi/ParallelWaveGAN and + https://github.com/NVIDIA/BigVGAN + +""" +class ResBlock(torch.nn.Module): + """Residual block module in HiFiGAN/BigVGAN.""" + def __init__( + self, + channels: int = 512, + kernel_size: int = 3, + dilations: tp.List[int] = [1, 3, 5], + ): + super(ResBlock, self).__init__() + self.convs1 = nn.ModuleList() + self.convs2 = nn.ModuleList() + + for dilation in dilations: + self.convs1.append( + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation, + padding=get_padding(kernel_size, dilation) + ) + ) + ) + self.convs2.append( + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1) + ) + ) + ) + self.convs1.apply(init_weights) + self.convs2.apply(init_weights) + self.activations1 = nn.ModuleList([ + Snake(channels, alpha_logscale=False) + for _ in range(len(self.convs1)) + ]) + self.activations2 = nn.ModuleList([ + Snake(channels, alpha_logscale=False) + for _ in range(len(self.convs2)) + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for idx in range(len(self.convs1)): + xt = self.activations1[idx](x) + xt = self.convs1[idx](xt) + xt = self.activations2[idx](xt) + xt = self.convs2[idx](xt) + x = xt + x + return x + + def remove_weight_norm(self): + for idx in range(len(self.convs1)): + remove_weight_norm(self.convs1[idx]) + remove_weight_norm(self.convs2[idx]) + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = (f0 > self.voiced_threshold).type(torch.float32) + return uv + + @torch.no_grad() + def forward(self, f0): + """ + :param f0: [B, 1, sample_len], Hz + :return: [B, 1, sample_len] + """ + + F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device) + for i in range(self.harmonic_num + 1): + F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate + + theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1) + u_dist = Uniform(low=-np.pi, high=np.pi) + phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device) + phase_vec[:, 0, :] = 0 + + # generate sine waveforms + sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec) + + # generate uv signal + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . 
for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen(sampling_rate, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + with torch.no_grad(): + sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2)) + sine_wavs = sine_wavs.transpose(1, 2) + uv = uv.transpose(1, 2) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.sine_amp / 3 + return sine_merge, noise, uv + + +class HiFTGenerator(nn.Module): + """ + HiFTNet Generator: Neural Source Filter + ISTFTNet + https://arxiv.org/abs/2309.09493 + """ + def __init__( + self, + in_channels: int = 80, + base_channels: int = 512, + nb_harmonics: int = 8, + sampling_rate: int = 22050, + nsf_alpha: float = 0.1, + nsf_sigma: float = 0.003, + nsf_voiced_threshold: float = 10, + upsample_rates: tp.List[int] = [8, 8], + upsample_kernel_sizes: tp.List[int] = [16, 16], + istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4}, + resblock_kernel_sizes: tp.List[int] = [3, 7, 11], + resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + source_resblock_kernel_sizes: tp.List[int] = [7, 11], + source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]], + lrelu_slope: float = 0.1, + audio_limit: float = 0.99, + f0_predictor: torch.nn.Module = None, + ): + super(HiFTGenerator, self).__init__() + + self.out_channels = 1 + self.nb_harmonics = nb_harmonics + self.sampling_rate = sampling_rate + self.istft_params = istft_params + self.lrelu_slope = lrelu_slope + self.audio_limit = audio_limit + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.m_source = SourceModuleHnNSF( + sampling_rate=sampling_rate, + upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"], + harmonic_num=nb_harmonics, + sine_amp=nsf_alpha, + add_noise_std=nsf_sigma, + 
voiced_threshod=nsf_voiced_threshold) + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"]) + + self.conv_pre = weight_norm( + Conv1d(in_channels, base_channels, 7, 1, padding=3) + ) + + # Up + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + base_channels // (2**i), + base_channels // (2**(i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + # Down + self.source_downs = nn.ModuleList() + self.source_resblocks = nn.ModuleList() + downsample_rates = [1] + upsample_rates[::-1][:-1] + downsample_cum_rates = np.cumprod(downsample_rates) + for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, + source_resblock_dilation_sizes)): + if u == 1: + self.source_downs.append( + Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1) + ) + else: + self.source_downs.append( + Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2)) + ) + + self.source_resblocks.append( + ResBlock(base_channels // (2 ** (i + 1)), k, d) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = base_channels // (2**(i + 1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + self.reflection_pad = nn.ReflectionPad1d((1, 0)) + self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32)) + self.f0_predictor = f0_predictor + + def _f02source(self, f0: torch.Tensor) -> torch.Tensor: + f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t + + har_source, _, _ = self.m_source(f0) + return har_source.transpose(1, 2) + + def _stft(self, x): + spec = torch.stft( + x, + self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device), + return_complex=True) + spec = torch.view_as_real(spec) # [B, F, TT, 2] + return spec[..., 0], spec[..., 1] + + def _istft(self, magnitude, phase): + magnitude = torch.clip(magnitude, max=1e2) + real = magnitude * torch.cos(phase) + img = magnitude * torch.sin(phase) + inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device)) + return inverse_transform + + def forward(self, x: torch.Tensor) -> torch.Tensor: + f0 = self.f0_predictor(x) + s = self._f02source(f0) + + s_stft_real, s_stft_imag = self._stft(s.squeeze(1)) + s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1) + + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, self.lrelu_slope) + x = self.ups[i](x) + + if i == self.num_upsamples - 1: + x = self.reflection_pad(x) + + # fusion + si = self.source_downs[i](s_stft) + si = self.source_resblocks[i](si) + x = x + si + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + x = F.leaky_relu(x) + x = self.conv_post(x) + magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :]) + phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is 
redundancy + + x = self._istft(magnitude, phase) + x = torch.clamp(x, -self.audio_limit, self.audio_limit) + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + self.source_module.remove_weight_norm() + for l in self.source_downs: + remove_weight_norm(l) + for l in self.source_resblocks: + l.remove_weight_norm() + + @torch.inference_mode() + def inference(self, mel: torch.Tensor) -> torch.Tensor: + return self.forward(x=mel) diff --git a/audio_detokenizer/transformer/__init__.py b/audio_detokenizer/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detokenizer/transformer/__pycache__/__init__.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5192de037894de4a2d8480629d3bbee52501cef Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/activation.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/activation.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f56d5ae0817b7f7c580bb9595d01654992836e2 Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/activation.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/attention.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/attention.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..237bddc6539eddca2980372a429aae3ac096a623 Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/attention.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/convolution.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/convolution.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed55fd8e4d4278c363be1ee4d069a470dd59e926 Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/convolution.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/embedding.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/embedding.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..372c0c3857f7294cb1041c4398b7f70005a9d56b Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/embedding.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/encoder.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/encoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..360857607f003b2f78d83b65a3cf244e6778ee55 Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/encoder.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/encoder_layer.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/encoder_layer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b01b625ed4df672fbcaf1cfa7ecf10ce9db5a6b Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/encoder_layer.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/positionwise_feed_forward.cpython-38.pyc 
b/audio_detokenizer/transformer/__pycache__/positionwise_feed_forward.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff9fb3a39547d112053fe810b2add337fb6407f9 Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/positionwise_feed_forward.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/__pycache__/subsampling.cpython-38.pyc b/audio_detokenizer/transformer/__pycache__/subsampling.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5afc5e4db8265c30d84b932eafb877516c2cae8d Binary files /dev/null and b/audio_detokenizer/transformer/__pycache__/subsampling.cpython-38.pyc differ diff --git a/audio_detokenizer/transformer/activation.py b/audio_detokenizer/transformer/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..8cea54816385d3b6585ccc2417bc71630d578177 --- /dev/null +++ b/audio_detokenizer/transformer/activation.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) +# 2020 Northwestern Polytechnical University (Pengcheng Guo) +# 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Swish() activation function for Conformer.""" + +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Swish(torch.nn.Module): + """Construct an Swish object.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Return Swish activation function.""" + return x * torch.sigmoid(x) + + +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. 
+ Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x diff --git a/audio_detokenizer/transformer/attention.py b/audio_detokenizer/transformer/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..0bbcea52740bf556c39f45d574ea9b691f630b7b --- /dev/null +++ b/audio_detokenizer/transformer/attention.py @@ -0,0 +1,463 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Multi-Head Attention layer definition.""" + +import math +from typing import Tuple + +import torch +from torch import nn + + +class MultiHeadedAttention(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, + n_head: int, + n_feat: int, + dropout_rate: float, + key_bias: bool = True): + """Construct an MultiHeadedAttention object.""" + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.dropout = nn.Dropout(p=dropout_rate) + self.dropout_rate = dropout_rate + self.kv_cache = None + + def forward_qkv( + self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor, size + (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor, size + (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor, size + (#batch, n_head, time2, d_k). + + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + + return q, k, v + + def forward_attention( + self, + value: torch.Tensor, + scores: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool) + ) -> torch.Tensor: + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value, size + (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score, size + (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask, size (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. 
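+
+        Note (added for clarity): masked positions are filled with -inf before
+        the softmax and zeroed again afterwards, so a fully masked row yields
+        all-zero attention weights rather than NaNs.
+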
+ + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + # NOTE(xcsong): When will `if mask.size(2) > 0` be True? + # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the + # 1st chunk to ease the onnx export.] + # 2. pytorch training + if mask.size(2) > 0: # time2 > 0 + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + # For last chunk, time2 might be larger than scores.size(-1) + mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2) + scores = scores.masked_fill(mask, -float('inf')) + attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0) # (batch, head, time1, time2) + # NOTE(xcsong): When will `if mask.size(2) > 0` be False? + # 1. onnx(16/-1, -1/-1, 16/0) + # 2. jit (16/-1, -1/-1, 16/0, 16/4) + else: + attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = (x.transpose(1, 2).contiguous().view(n_batch, -1, + self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + 1.When applying cross attention between decoder and encoder, + the batch padding mask for input is in (#batch, 1, T) shape. + 2.When applying self attention of encoder, + the mask is in (#batch, T, T) shape. + 3.When applying self attention of decoder, + the mask is in (#batch, L, L) shape. + 4.If the different position in decoder see different block + of the encoder, such as Mocha, the passed in mask could be + in (#batch, L, T) shape. + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + # NOTE(xcsong): + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. 
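+        # Added note: the cache packs keys and values along the last dim, i.e.
+        # cache[..., :d_k] holds keys and cache[..., d_k:] holds values, so a
+        # (1, head, cache_t, d_k * 2) cache splits into two (1, head, cache_t, d_k)
+        # halves that are prepended to k and v. The toy example below shows that
+        # concatenating / splitting zero-sized tensors is harmless: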
+ # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.size(0) > 0: + key_cache, value_cache = torch.split(cache, + cache.size(-1) // 2, + dim=-1) + k = torch.cat([key_cache, k], dim=2) + v = torch.cat([value_cache, v], dim=2) + # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = torch.cat((k, v), dim=-1) + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask), new_cache + + def inference( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache_offset: torch.Tensor = None, + is_infer_short: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + 1.When applying cross attention between decoder and encoder, + the batch padding mask for input is in (#batch, 1, T) shape. + 2.When applying self attention of encoder, + the mask is in (#batch, T, T) shape. + 3.When applying self attention of decoder, + the mask is in (#batch, L, L) shape. + 4.If the different position in decoder see different block + of the encoder, such as Mocha, the passed in mask could be + in (#batch, L, T) shape. + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + if self.kv_cache is not None: + k, v = self.kv_cache.update(cache_offset, k, v, is_infer_short) + + assert mask.dtype == torch.bool + mask = mask.unsqueeze(1).eq(False) * torch.finfo(q.dtype).min + + output = torch.nn.functional.scaled_dot_product_attention( + q, + k, + v, + attn_mask=mask, + dropout_p=self.dropout_rate, + scale=1 / math.sqrt(self.d_k), + ) + output = (output.transpose(1, 2).contiguous().view( + query.size(0), -1, + self.h * self.d_k)) # (batch, time1, d_model) + return self.linear_out(output) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
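+
+    Example (illustrative sketch added for clarity; shapes and values are
+    assumptions, with pos_emb following the espnet-style length 2*T-1):
+        >>> attn = RelPositionMultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.0)
+        >>> q = torch.randn(2, 10, 256)
+        >>> pos_emb = torch.randn(1, 2 * 10 - 1, 256)
+        >>> out, cache = attn(q, q, q, pos_emb=pos_emb)
+        >>> out.shape
+        torch.Size([2, 10, 256])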
+ """ + + def __init__(self, + n_head: int, + n_feat: int, + dropout_rate: float, + key_bias: bool = True): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate, key_bias) + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """Compute relative positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + + Returns: + torch.Tensor: Output tensor. + + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + return x + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + # NOTE(xcsong): + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. 
+ # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.size(0) > 0: + key_cache, value_cache = torch.split(cache, + cache.size(-1) // 2, + dim=-1) + k = torch.cat([key_cache, k], dim=2) + v = torch.cat([value_cache, v], dim=2) + # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = torch.cat((k, v), dim=-1) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, time2) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used + if matrix_ac.shape != matrix_bd.shape: + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask), new_cache + + def inference( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache_offset: torch.Tensor = None, + is_infer_short: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + if self.kv_cache is not None: + k, v = self.kv_cache.update(cache_offset, k, v, is_infer_short) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute matrix b and matrix d + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used + matrix_bd = self.rel_shift(matrix_bd) + + assert mask.dtype == torch.bool + # mask = (mask.unsqueeze(1).eq(False) * torch.finfo(k.dtype).min).to(matrix_bd.dtype) + mask = mask.unsqueeze(1).eq(False) + mask = (matrix_bd / math.sqrt(self.d_k)).masked_fill(mask, torch.tensor(-float('inf'), dtype=matrix_bd.dtype)) + # import pdb; pdb.set_trace() + # print("q_with_bias_u.shape", q_with_bias_u.shape) + # print("k.shape", k.shape) + # print("v.shape", v.shape) + # print("mask.shape", mask.shape) + # import pdb; pdb.set_trace() + output = torch.nn.functional.scaled_dot_product_attention( + q_with_bias_u, + k, + v, + attn_mask=mask, + dropout_p=self.dropout_rate, + scale=1 / math.sqrt(self.d_k), + ) + + output = (output.transpose(1, 2).contiguous().view( + query.size(0), -1, self.h * self.d_k)) # (batch, time1, d_model) + return self.linear_out(output) diff --git a/audio_detokenizer/transformer/convolution.py b/audio_detokenizer/transformer/convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..4d5d96149154776000991a681a666fbe55e562fe --- /dev/null +++ b/audio_detokenizer/transformer/convolution.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" + +from typing import Tuple + +import torch +from torch import nn + + +class ConvolutionModule(nn.Module): + """ConvolutionModule in Conformer model.""" + + def __init__(self, + channels: int, + kernel_size: int = 15, + activation: nn.Module = nn.ReLU(), + norm: str = "batch_norm", + causal: bool = False, + bias: bool = True): + """Construct an ConvolutionModule object. + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernel size of conv layers. 
+ causal (int): Whether use causal convolution or not + """ + super().__init__() + + self.pointwise_conv1 = nn.Conv1d( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + # self.lorder is used to distinguish if it's a causal convolution, + # if self.lorder > 0: it's a causal convolution, the input will be + # padded with self.lorder frames on the left in forward. + # else: it's a symmetrical convolution + if causal: + padding = 0 + self.lorder = kernel_size - 1 + else: + # kernel_size should be an odd number for none causal convolution + assert (kernel_size - 1) % 2 == 0 + padding = (kernel_size - 1) // 2 + self.lorder = 0 + self.depthwise_conv = nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + padding=padding, + groups=channels, + bias=bias, + ) + + assert norm in ['batch_norm', 'layer_norm'] + if norm == "batch_norm": + self.use_layer_norm = False + self.norm = nn.BatchNorm1d(channels) + else: + self.use_layer_norm = True + self.norm = nn.LayerNorm(channels) + + self.pointwise_conv2 = nn.Conv1d( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.activation = activation + + def forward( + self, + x: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + cache: torch.Tensor = torch.zeros((0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute convolution module. + Args: + x (torch.Tensor): Input tensor (#batch, time, channels). + mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), + (0, 0, 0) means fake mask. + cache (torch.Tensor): left context cache, it is only + used in causal convolution (#batch, channels, cache_t), + (0, 0, 0) meas fake cache. + Returns: + torch.Tensor: Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) # (#batch, channels, time) + + # mask batch padding + if mask_pad.size(2) > 0: # time > 0 + x.masked_fill_(~mask_pad, 0.0) + + if self.lorder > 0: + if cache.size(2) == 0: # cache_t == 0 + x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) + else: + assert cache.size(0) == x.size(0) # equal batch + assert cache.size(1) == x.size(1) # equal channel + x = torch.cat((cache, x), dim=2) + assert (x.size(2) > self.lorder) + new_cache = x[:, :, -self.lorder:] + else: + # It's better we just return None if no cache is required, + # However, for JIT export, here we just fake one tensor instead of + # None. + new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, dim=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + if self.use_layer_norm: + x = x.transpose(1, 2) + x = self.activation(self.norm(x)) + if self.use_layer_norm: + x = x.transpose(1, 2) + x = self.pointwise_conv2(x) + # mask batch padding + if mask_pad.size(2) > 0: # time > 0 + x.masked_fill_(~mask_pad, 0.0) + + return x.transpose(1, 2), new_cache diff --git a/audio_detokenizer/transformer/decoder.py b/audio_detokenizer/transformer/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..6a3acb1c5f2fb50ad336625b024ba77f3c851951 --- /dev/null +++ b/audio_detokenizer/transformer/decoder.py @@ -0,0 +1,396 @@ +# Copyright (c) 2021 Mobvoi Inc. 
(authors: Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Decoder definition.""" +from typing import Tuple, List, Optional + +import torch +import torch.utils.checkpoint as ckpt +import logging + +from .decoder_layer import DecoderLayer +from .positionwise_feed_forward import PositionwiseFeedForward +from ..utils.class_utils import ( + BAILING_EMB_CLASSES, + BAILING_ATTENTION_CLASSES, + BAILING_ACTIVATION_CLASSES, +) +from ..utils.mask import (subsequent_mask, make_pad_mask) + + +class TransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + src_attention: if false, encoder-decoder cross attention is not + applied, such as CIF model + key_bias: whether use bias in attention.linear_k, False for whisper models. + gradient_checkpointing: rerunning a forward-pass segment for each + checkpointed segment during backward. 
+ tie_word_embedding: Tie or clone module weights depending of whether we are + using TorchScript or not + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + src_attention: bool = True, + key_bias: bool = True, + activation_type: str = "relu", + gradient_checkpointing: bool = False, + tie_word_embedding: bool = False, + ): + super().__init__() + attention_dim = encoder_output_size + activation = BAILING_ACTIVATION_CLASSES[activation_type]() + + self.embed = torch.nn.Sequential( + torch.nn.Identity() if input_layer == "no_pos" else + torch.nn.Embedding(vocab_size, attention_dim), + BAILING_EMB_CLASSES[input_layer](attention_dim, + positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5) + self.use_output_layer = use_output_layer + if use_output_layer: + self.output_layer = torch.nn.Linear(attention_dim, vocab_size) + else: + self.output_layer = torch.nn.Identity() + self.num_blocks = num_blocks + self.decoders = torch.nn.ModuleList([ + DecoderLayer( + attention_dim, + BAILING_ATTENTION_CLASSES["selfattn"]( + attention_heads, attention_dim, + self_attention_dropout_rate, key_bias), + BAILING_ATTENTION_CLASSES["selfattn"]( + attention_heads, attention_dim, src_attention_dropout_rate, + key_bias) if src_attention else None, + PositionwiseFeedForward(attention_dim, linear_units, + dropout_rate, activation), + dropout_rate, + normalize_before, + ) for _ in range(self.num_blocks) + ]) + + self.gradient_checkpointing = gradient_checkpointing + self.tie_word_embedding = tie_word_embedding + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: torch.Tensor = torch.empty(0), + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: not used in transformer decoder, in order to unify api + with bidirectional decoder + reverse_weight: not used in transformer decoder, in order to unify + api with bidirectional decode + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + torch.tensor(0.0), in order to unify api with bidirectional decoder + olens: (batch, ) + NOTE(xcsong): + We pass the `__call__` method of the modules instead of `forward` to the + checkpointing API because `__call__` attaches all the hooks of the module. 
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + """ + tgt = ys_in_pad + maxlen = tgt.size(1) + # tgt_mask: (B, 1, L) + tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1) + tgt_mask = tgt_mask.to(tgt.device) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.size(-1), + device=tgt_mask.device).unsqueeze(0) + # tgt_mask: (B, L, L) + tgt_mask = tgt_mask & m + x, _ = self.embed(tgt) + if self.gradient_checkpointing and self.training: + x = self.forward_layers_checkpointed(x, tgt_mask, memory, + memory_mask) + else: + x = self.forward_layers(x, tgt_mask, memory, memory_mask) + if self.normalize_before: + x = self.after_norm(x) + if self.use_output_layer: + x = self.output_layer(x) + olens = tgt_mask.sum(1) + return x, torch.tensor(0.0), olens + + def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor) -> torch.Tensor: + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) + return x + + @torch.jit.ignore(drop=True) + def forward_layers_checkpointed(self, x: torch.Tensor, + tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor) -> torch.Tensor: + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = ckpt.checkpoint( + layer.__call__, x, tgt_mask, memory, memory_mask) + return x + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + x, _ = self.embed(tgt) + new_cache = [] + for i, decoder in enumerate(self.decoders): + if cache is None: + c = None + else: + c = cache[i] + x, tgt_mask, memory, memory_mask = decoder(x, + tgt_mask, + memory, + memory_mask, + cache=c) + new_cache.append(x) + if self.normalize_before: + y = self.after_norm(x[:, -1]) + else: + y = x[:, -1] + if self.use_output_layer: + y = torch.log_softmax(self.output_layer(y), dim=-1) + return y, new_cache + + def tie_or_clone_weights(self, jit_mode: bool = True): + """Tie or clone module weights (between word_emb and output_layer) + depending of whether we are using TorchScript or not""" + if not self.use_output_layer: + return + if jit_mode: + logging.info("clone emb.weight to output.weight") + self.output_layer.weight = torch.nn.Parameter( + self.embed[0].weight.clone()) + else: + logging.info("tie emb.weight with output.weight") + self.output_layer.weight = self.embed[0].weight + + if getattr(self.output_layer, "bias", None) is not None: + self.output_layer.bias.data = torch.nn.functional.pad( + self.output_layer.bias.data, + ( + 0, + self.output_layer.weight.shape[0] - + self.output_layer.bias.shape[0], + ), + "constant", + 0, + ) + + +class BiTransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. 
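+    Combines a left-to-right TransformerDecoder with an optional right-to-left
+    decoder of r_num_blocks layers; the reverse branch is only evaluated in
+    forward() when reverse_weight > 0.0.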
+ Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + r_num_blocks: the number of right to left decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + key_bias: whether use bias in attention.linear_k, False for whisper models. + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + r_num_blocks: int = 0, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + key_bias: bool = True, + gradient_checkpointing: bool = False, + tie_word_embedding: bool = False, + ): + + super().__init__() + self.tie_word_embedding = tie_word_embedding + self.left_decoder = TransformerDecoder( + vocab_size, + encoder_output_size, + attention_heads, + linear_units, + num_blocks, + dropout_rate, + positional_dropout_rate, + self_attention_dropout_rate, + src_attention_dropout_rate, + input_layer, + use_output_layer, + normalize_before, + key_bias=key_bias, + gradient_checkpointing=gradient_checkpointing, + tie_word_embedding=tie_word_embedding) + + self.right_decoder = TransformerDecoder( + vocab_size, + encoder_output_size, + attention_heads, + linear_units, + r_num_blocks, + dropout_rate, + positional_dropout_rate, + self_attention_dropout_rate, + src_attention_dropout_rate, + input_layer, + use_output_layer, + normalize_before, + key_bias=key_bias, + gradient_checkpointing=gradient_checkpointing, + tie_word_embedding=tie_word_embedding) + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: torch.Tensor, + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. 
+ Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), + used for right to left decoder + reverse_weight: used for right to left decoder + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + r_x: x: decoded token score (right to left decoder) + before softmax (batch, maxlen_out, vocab_size) + if use_output_layer is True, + olens: (batch, ) + """ + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = torch.tensor(0.0) + if reverse_weight > 0.0: + r_x, _, olens = self.right_decoder(memory, memory_mask, + r_ys_in_pad, ys_in_lens) + return l_x, r_x, olens + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + return self.left_decoder.forward_one_step(memory, memory_mask, tgt, + tgt_mask, cache) + + def tie_or_clone_weights(self, jit_mode: bool = True): + """Tie or clone module weights (between word_emb and output_layer) + depending of whether we are using TorchScript or not""" + self.left_decoder.tie_or_clone_weights(jit_mode) + self.right_decoder.tie_or_clone_weights(jit_mode) diff --git a/audio_detokenizer/transformer/decoder_layer.py b/audio_detokenizer/transformer/decoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..91c7c5d7fb2a8e79cea7705646e5381016f73466 --- /dev/null +++ b/audio_detokenizer/transformer/decoder_layer.py @@ -0,0 +1,132 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Decoder self-attention layer definition.""" +from typing import Optional, Tuple + +import torch +from torch import nn + + +class DecoderLayer(nn.Module): + """Single decoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (torch.nn.Module): Inter-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. 
+ If `None` is passed, Inter-attention is not used, such as + CIF, GPT, and other decoder only model. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: to use layer_norm after each sub-block. + """ + + def __init__( + self, + size: int, + self_attn: nn.Module, + src_attn: Optional[nn.Module], + feed_forward: nn.Module, + dropout_rate: float, + normalize_before: bool = True, + ): + """Construct an DecoderLayer object.""" + super().__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = nn.LayerNorm(size, eps=1e-5) + self.norm2 = nn.LayerNorm(size, eps=1e-5) + self.norm3 = nn.LayerNorm(size, eps=1e-5) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + + def forward( + self, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor, + cache: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute decoded features. + + Args: + tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask (torch.Tensor): Mask for input tensor + (#batch, maxlen_out). + memory (torch.Tensor): Encoded memory + (#batch, maxlen_in, size). + memory_mask (torch.Tensor): Encoded memory mask + (#batch, maxlen_in). + cache (torch.Tensor): cached tensors. + (#batch, maxlen_out - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, maxlen_out, size). + torch.Tensor: Mask for output tensor (#batch, maxlen_out). + torch.Tensor: Encoded memory (#batch, maxlen_in, size). + torch.Tensor: Encoded memory mask (#batch, maxlen_in). + + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == ( + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + tgt_q_mask = tgt_mask[:, -1:, :] + + x = residual + self.dropout( + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) + if not self.normalize_before: + x = self.norm1(x) + + if self.src_attn is not None: + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout( + self.src_attn(x, memory, memory, memory_mask)[0]) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x, tgt_mask, memory, memory_mask diff --git a/audio_detokenizer/transformer/embedding.py b/audio_detokenizer/transformer/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..cecd906060ae5091ca85ba20fda95dd80079bd5a --- /dev/null +++ b/audio_detokenizer/transformer/embedding.py @@ -0,0 +1,303 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Positonal Encoding Module.""" + +import math +from typing import Tuple, Union + +import torch +import torch.nn.functional as F +import numpy as np + + +class PositionalEncoding(torch.nn.Module): + """Positional encoding. + + :param int d_model: embedding dim + :param float dropout_rate: dropout rate + :param int max_len: maximum input length + + PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) + PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) + """ + + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int = 5000, + reverse: bool = False): + """Construct an PositionalEncoding object.""" + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.max_len = max_len + + self.pe = torch.zeros(self.max_len, self.d_model) + position = torch.arange(0, self.max_len, + dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * + -(math.log(10000.0) / self.d_model)) + self.pe[:, 0::2] = torch.sin(position * div_term) + self.pe[:, 1::2] = torch.cos(position * div_term) + self.pe = self.pe.unsqueeze(0) + + def forward(self, + x: torch.Tensor, + offset: Union[int, torch.Tensor] = 0) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """Add positional encoding. + + Args: + x (torch.Tensor): Input. Its shape is (batch, time, ...) + offset (int, torch.tensor): position offset + + Returns: + torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) + torch.Tensor: for compatibility to RelPositionalEncoding + """ + + self.pe = self.pe.to(x.device) + pos_emb = self.position_encoding(offset, x.size(1), False) + x = x * self.xscale + pos_emb + return self.dropout(x), self.dropout(pos_emb) + + def position_encoding(self, + offset: Union[int, torch.Tensor], + size: int, + apply_dropout: bool = True) -> torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! + we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. 
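+
+        A minimal shape sketch (illustrative; values are assumed):
+            >>> pe = PositionalEncoding(d_model=256, dropout_rate=0.0)
+            >>> pe.position_encoding(offset=16, size=8, apply_dropout=False).shape
+            torch.Size([1, 8, 256])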
+ + Args: + offset (int or torch.tensor): start offset + size (int): required size of position encoding + + Returns: + torch.Tensor: Corresponding encoding + """ + # How to subscript a Union type: + # https://github.com/pytorch/pytorch/issues/69434 + if isinstance(offset, int): + assert offset + size <= self.max_len + pos_emb = self.pe[:, offset:offset + size] + elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar + assert offset + size <= self.max_len + pos_emb = self.pe[:, offset:offset + size] + else: # for batched streaming decoding on GPU + assert torch.max(offset) + size <= self.max_len + index = offset.unsqueeze(1) + \ + torch.arange(0, size).to(offset.device) # B X T + flag = index > 0 + # remove negative offset + index = index * flag + pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model + + if apply_dropout: + pos_emb = self.dropout(pos_emb) + return pos_emb + + +class RelPositionalEncoding(PositionalEncoding): + """Relative positional encoding module. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + """Initialize class.""" + super().__init__(d_model, dropout_rate, max_len, reverse=True) + + def forward(self, + x: torch.Tensor, + offset: Union[int, torch.Tensor] = 0) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """Compute positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + torch.Tensor: Positional embedding tensor (1, time, `*`). + """ + self.pe = self.pe.to(x.device) + x = x * self.xscale + pos_emb = self.position_encoding(offset, x.size(1), False) + return self.dropout(x), self.dropout(pos_emb) + + +class WhisperPositionalEncoding(PositionalEncoding): + """ Sinusoids position encoding used in openai-whisper.encoder + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500): + super().__init__(d_model, dropout_rate, max_len) + self.xscale = 1.0 + log_timescale_increment = np.log(10000) / (d_model // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * + torch.arange(d_model // 2)) + scaled_time = torch.arange(max_len)[:, np.newaxis] * \ + inv_timescales[np.newaxis, :] + pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) + delattr(self, "pe") + self.register_buffer("pe", pe.unsqueeze(0)) + + +class LearnablePositionalEncoding(PositionalEncoding): + """ Learnable position encoding used in openai-whisper.decoder + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448): + super().__init__(d_model, dropout_rate, max_len) + # NOTE(xcsong): overwrite self.pe & self.xscale + self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model)) + self.xscale = 1.0 + + +class NoPositionalEncoding(torch.nn.Module): + """ No position encoding + """ + + def __init__(self, d_model: int, dropout_rate: float): + super().__init__() + self.d_model = d_model + self.dropout = torch.nn.Dropout(p=dropout_rate) + + def forward(self, + x: torch.Tensor, + offset: Union[int, torch.Tensor] = 0) \ + -> Tuple[torch.Tensor, torch.Tensor]: + """ Just return zero vector for interface compatibility + """ + pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device) + return self.dropout(x), pos_emb + + def position_encoding(self, offset: Union[int, torch.Tensor], + size: int) -> torch.Tensor: + return 
torch.zeros(1, size, self.d_model) + + +class EspnetRelPositionalEncoding(torch.nn.Module): + """Relative positional encoding module (new implementation). + + Details can be found in https://github.com/espnet/espnet/pull/2816. + + See : Appendix B in https://arxiv.org/abs/1901.02860 + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + + """ + + def __init__(self, d_model, dropout_rate, max_len=5000): + """Construct an PositionalEncoding object.""" + super(EspnetRelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, 4096 * 2)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! + we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. + + Args: + offset (int or torch.tensor): start offset + size (int): required size of position encoding + + Returns: + torch.Tensor: Corresponding encoding + """ + pos_emb = self.pe[ + :, + self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 + size, + ] + return pos_emb + + def fix_position_encoding(self, + offset: Union[int, torch.Tensor], + size: int, + max_len: int) -> torch.Tensor: + pos_emb = self.pe[ + :, + self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 - size + 2 * max_len, + ] + return pos_emb diff --git a/audio_detokenizer/transformer/encoder.py b/audio_detokenizer/transformer/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..53f60e0003077134e1fda3753a61de5470b9bdf8 --- /dev/null +++ b/audio_detokenizer/transformer/encoder.py @@ -0,0 +1,676 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
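# --- Minimal usage sketch for the relative positional-encoding window above.
# The table built by EspnetRelPositionalEncoding.extend_pe() stores both
# positive and negative offsets (length 2*max_len - 1, offset 0 at the center),
# and position_encoding() slices 2*size - 1 entries around that center, i.e.
# offsets +(size-1) down to -(size-1). The toy table below stores only the
# offset value itself; max_len=16, size=5, d_model=4 are arbitrary assumptions.
import torch

def toy_rel_pe_window(max_len: int, size: int, d_model: int) -> torch.Tensor:
    # offsets ordered +(max_len-1) ... 0 ... -(max_len-1), like the real table
    offsets = torch.arange(-(max_len - 1), max_len, dtype=torch.float32).flip(0)
    pe = offsets.unsqueeze(0).unsqueeze(-1).expand(1, -1, d_model)
    center = pe.size(1) // 2  # index of relative offset 0
    return pe[:, center - size + 1 : center + size]

if __name__ == "__main__":
    window = toy_rel_pe_window(max_len=16, size=5, d_model=4)
    assert window.size(1) == 2 * 5 - 1  # 9 relative positions
    print(window[0, :, 0])  # tensor([ 4.,  3.,  2.,  1.,  0., -1., -2., -3., -4.])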
+# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +from typing import Tuple + +import torch +import torch.utils.checkpoint as ckpt + +from .convolution import ConvolutionModule +from .encoder_layer import TransformerEncoderLayer +from .encoder_layer import ConformerEncoderLayer +from .positionwise_feed_forward import PositionwiseFeedForward + +from ..utils.class_utils import ( + BAILING_EMB_CLASSES, + BAILING_SUBSAMPLE_CLASSES, + BAILING_ATTENTION_CLASSES, + BAILING_ACTIVATION_CLASSES, +) +from ..utils.mask import make_pad_mask +from ..utils.mask import add_optional_chunk_mask + +import torch.nn.functional as F +import logging + +class KVCache(torch.nn.Module): + def __init__(self, max_batch_size, n_heads, max_seq_short, max_seq_long, head_dim, dtype=torch.float32): + super().__init__() + cache_shape_short = (max_batch_size, n_heads, max_seq_short, head_dim) + self.register_buffer('k_cache_short', torch.zeros(cache_shape_short, dtype=dtype, device='cuda')) + self.register_buffer('v_cache_short', torch.zeros(cache_shape_short, dtype=dtype, device='cuda')) + + cache_shape_long = (max_batch_size, n_heads, max_seq_long, head_dim) + self.register_buffer('k_cache_long', torch.zeros(cache_shape_long, dtype=dtype, device='cuda')) + self.register_buffer('v_cache_long', torch.zeros(cache_shape_long, dtype=dtype, device='cuda')) + + self.max_seq_short = max_seq_short + self.max_seq_long = max_seq_long + + def update(self, input_pos, k_val, v_val, is_infer_short): + # input_pos: [S], k_val: [B, H, S, D] + assert input_pos.shape[0] == k_val.shape[2] + if is_infer_short: + k_out = self.k_cache_short + v_out = self.v_cache_short + else: + k_out = self.k_cache_long + v_out = self.v_cache_long + + k_out[:, :, input_pos] = k_val + v_out[:, :, input_pos] = v_val + return k_out, v_out + + +class BaseEncoder(torch.nn.Module): + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + gradient_checkpointing: bool = False, + max_seq_short: int = 384, + max_seq_long: int = 2048, + ): + """ + Args: + input_size (int): input dim + output_size (int): dimension of attention + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of decoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+ static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + key_bias: whether use bias in attention.linear_k, False for whisper models. + gradient_checkpointing: rerunning a forward-pass segment for each + checkpointed segment during backward. + """ + super().__init__() + self._output_size = output_size + + self.global_cmvn = global_cmvn + self.embed = BAILING_SUBSAMPLE_CLASSES[input_layer]( + input_size, + output_size, + dropout_rate, + BAILING_EMB_CLASSES[pos_enc_layer_type](output_size, + positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + self.gradient_checkpointing = gradient_checkpointing + self.attention_heads = attention_heads + self.head_dim = output_size // attention_heads + self.compiled_infer_short = None + self.compiled_infer_long = None + self.max_seq_short = max_seq_short + self.max_seq_long = max_seq_long + + def setup_caches(self, max_seq_short, max_seq_long, dtype=torch.float32): + # import pdb; pdb.set_trace() + assert max_seq_short == self.max_seq_short and max_seq_long == self.max_seq_long + for it in self.encoders: + it.self_attn.kv_cache = KVCache(1, self.attention_heads, self.max_seq_short, self.max_seq_long, + self.head_dim, dtype) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embed positions in tensor. + + Args: + xs: padded input tensor (B, T, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor xs, and subsampled masks + xs: padded output tensor (B, T' ~= T/subsample_rate, D) + masks: torch.Tensor batch padding mask after subsample + (B, 1, T' ~= T/subsample_rate) + NOTE(xcsong): + We pass the `__call__` method of the modules instead of `forward` to the + checkpointing API because `__call__` attaches all the hooks of the module. 
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + """ + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) + if self.gradient_checkpointing and self.training: + xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb, + mask_pad) + else: + xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks + + def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + return xs + + @torch.jit.ignore(drop=True) + def forward_layers_checkpointed(self, xs: torch.Tensor, + chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs, + chunk_masks, pos_emb, + mask_pad) + return xs + + def forward_chunk( + self, + xs: torch.Tensor, + offset: int, + required_cache_size: int, + att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), + cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0), + att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ Forward just one chunk + + Args: + xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + att_cache (torch.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (elayers, head, cache_t1, d_k * 2), where + `head * d_k == hidden-dim` and + `cache_t1 == chunk_size * num_decoding_left_chunks`. + cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, + (elayers, b=1, hidden-dim, cache_t2), where + `cache_t2 == cnn.lorder - 1` + + Returns: + torch.Tensor: output of current input xs, + with shape (b=1, chunk_size, hidden-dim). + torch.Tensor: new attention cache required for next chunk, with + dynamic shape (elayers, head, ?, d_k * 2) + depending on required_cache_size. + torch.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. 
+ + """ + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, + xs.size(1), + device=xs.device, + dtype=torch.bool) + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) + # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) + elayers, cache_t1 = att_cache.size(0), att_cache.size(2) + chunk_size = xs.size(1) + attention_key_size = cache_t1 + chunk_size + pos_emb = self.embed.position_encoding(offset=offset - cache_t1, + size=attention_key_size) + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = attention_key_size + else: + next_cache_start = max(attention_key_size - required_cache_size, 0) + r_att_cache = [] + r_cnn_cache = [] + for i, layer in enumerate(self.encoders): + # NOTE(xcsong): Before layer.forward + # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2), + # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) + xs, _, new_att_cache, new_cnn_cache = layer( + xs, + att_mask, + pos_emb, + att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, + cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache) + # NOTE(xcsong): After layer.forward + # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2), + # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)setup_caches + r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) + r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) + if self.normalize_before: + xs = self.after_norm(xs) + + # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2), + # ? may be larger than cache_t1, it depends on required_cache_size + r_att_cache = torch.cat(r_att_cache, dim=0) + # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2) + r_cnn_cache = torch.cat(r_cnn_cache, dim=0) + + return (xs, r_att_cache, r_cnn_cache) + + def inference_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _ = layer.inference(xs, chunk_masks, pos_emb, mask_pad) + return xs + + @torch.jit.ignore(drop=True) + def inference_layers_checkpointed(self, xs: torch.Tensor, + chunk_masks: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor) -> torch.Tensor: + for layer in self.encoders: + xs, chunk_masks, _ = ckpt.checkpoint(layer.inference, xs, + chunk_masks, pos_emb, + mask_pad) + return xs + + def inference_prefill( + self, + xs: torch.Tensor, + offset: int, + cache_offset: int, + att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + fix_shape=False, + is_infer_short=False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, + xs.size(1), + device=xs.device, + dtype=torch.bool) + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) + # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim) + chunk_size = xs.size(1) + attention_key_size = cache_offset + chunk_size + max_seq = self.max_seq_short if is_infer_short else self.max_seq_long + if fix_shape: + pos_emb = self.embed.position_encoding(offset=offset - cache_offset, + 
size=attention_key_size) + target_seq_len = max_seq * 2 - 1 + current_seq_len = pos_emb.size(1) + padding_size = target_seq_len - current_seq_len + pos_emb = F.pad(pos_emb, (0, 0, 0, padding_size)) + else: + pos_emb = self.embed.position_encoding(offset=offset - cache_offset, + size=attention_key_size) + cache_offset = torch.arange(0, xs.shape[1], device=xs.device, dtype=torch.int) + + for i, layer in enumerate(self.encoders): + xs, _, _ = layer.inference( + xs, + att_mask, + pos_emb, + cache_offset=cache_offset, + is_infer_short=is_infer_short, + ) + if self.normalize_before: + xs = self.after_norm(xs) + + return xs + + def prepare_for_decode( + self, + xs: torch.Tensor, + offset: int, + cache_offset: int, + is_infer_short: bool, + ): + # assert xs.size(0) == 1 + chunk_size = xs.size(1) + attention_key_size = cache_offset + chunk_size + max_seq = self.max_seq_short if is_infer_short else self.max_seq_long + # tmp_masks = torch.ones(1, chunk_size, + # device=xs.device, dtype=torch.bool) + # tmp_masks = tmp_masks.unsqueeze(1) + # if self.global_cmvn is not None: + # xs = self.global_cmvn(xs) + # xs, _, _ = self.embed(xs, tmp_masks, offset) + xs = self.embed.out(xs) * self.embed.pos_enc.xscale + pos_emb = self.embed.fix_position_encoding(offset=offset - cache_offset, + size=attention_key_size, max_len=max_seq) + # print("xs, pos_emb", xs.device, pos_emb.device) + return xs, pos_emb + + def step_infer_short( + self, + xs: torch.Tensor, + pos_emb: torch.Tensor, + cache_offset: torch.Tensor, + att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + ): + for i, layer in enumerate(self.encoders): + xs, _, _ = layer.inference( + xs, + att_mask, + pos_emb, + cache_offset=cache_offset, + is_infer_short = True, + ) + if self.normalize_before: + xs = self.after_norm(xs) + return xs + + def step_infer_long( + self, + xs: torch.Tensor, + pos_emb: torch.Tensor, + cache_offset: torch.Tensor, + att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + ): + for i, layer in enumerate(self.encoders): + xs, _, _ = layer.inference( + xs, + att_mask, + pos_emb, + cache_offset=cache_offset, + is_infer_short=False, + ) + if self.normalize_before: + xs = self.after_norm(xs) + return xs + + def inference_decode_step( + self, + xs: torch.Tensor, + offset: int, + cache_offset: int, + att_mask, + is_infer_short, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + + xs, pos_emb = self.prepare_for_decode(xs, offset, cache_offset, is_infer_short) + cache_offset = torch.tensor([cache_offset], device=xs.device, dtype=torch.int32) + + # print("xs, att_mask: ", xs.shape, att_mask.shape) + + if is_infer_short: + if self.compiled_infer_short == None: + self.compiled_infer_short = torch.compile(self.step_infer_short, mode="reduce-overhead", fullgraph=True) + ret = self.compiled_infer_short(xs, pos_emb, cache_offset, att_mask) + elif not is_infer_short: + if self.compiled_infer_long == None: + self.compiled_infer_long = torch.compile(self.step_infer_long, mode="reduce-overhead", fullgraph=True) + ret = self.compiled_infer_long(xs, pos_emb, cache_offset, att_mask) + + return ret.clone() + + def forward_chunk_by_chunk( + self, + xs: torch.Tensor, + decoding_chunk_size: int, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ Forward input chunk by chunk with chunk_size like a streaming + fashion + + Here we should pay special attention to computation cache in the + streaming style forward chunk by chunk. 
Three things should be taken + into account for computation in the current network: + 1. transformer/conformer encoder layers output cache + 2. convolution in conformer + 3. convolution in subsampling + + However, we don't implement subsampling cache for: + 1. We can control subsampling module to output the right result by + overlapping input instead of cache left context, even though it + wastes some computation, but subsampling only takes a very + small fraction of computation in the whole model. + 2. Typically, there are several covolution layers with subsampling + in subsampling module, it is tricky and complicated to do cache + with different convolution layers with different subsampling + rate. + 3. Currently, nn.Sequential is used to stack all the convolution + layers in subsampling, we need to rewrite it to make it work + with cache, which is not prefered. + Args: + xs (torch.Tensor): (1, max_len, dim) + chunk_size (int): decoding chunk size + """ + assert decoding_chunk_size > 0 + # The model is trained by static or dynamic chunk + assert self.static_chunk_size > 0 or self.use_dynamic_chunk + subsampling = self.embed.subsampling_rate + context = self.embed.right_context + 1 # Add current frame + stride = subsampling * decoding_chunk_size + decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) + att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device) + outputs = [] + offset = 0 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] + (y, att_cache, + cnn_cache) = self.forward_chunk(chunk_xs, offset, + required_cache_size, att_cache, + cnn_cache) + outputs.append(y) + offset += y.size(1) + ys = torch.cat(outputs, 1) + masks = torch.ones((1, 1, ys.size(1)), + device=ys.device, + dtype=torch.bool) + return ys, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + key_bias: bool = True, + selfattention_layer_type: str = "selfattn", + activation_type: str = "relu", + gradient_checkpointing: bool = False, + max_seq_short: int = 384, + max_seq_long: int = 2048, + ): + """ Construct TransformerEncoder + + See Encoder for the meaning of each parameter. 
+ """ + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, gradient_checkpointing, max_seq_short, max_seq_long) + activation = BAILING_ACTIVATION_CLASSES[activation_type]() + self.encoders = torch.nn.ModuleList([ + TransformerEncoderLayer( + output_size, + BAILING_ATTENTION_CLASSES[selfattention_layer_type](attention_heads, + output_size, + attention_dropout_rate, + key_bias), + PositionwiseFeedForward(output_size, linear_units, + dropout_rate, activation), + dropout_rate, normalize_before) for _ in range(num_blocks) + ]) + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + key_bias: bool = True, + gradient_checkpointing: bool = False, + max_seq_short: int = 384, + max_seq_long: int = 2048, + ): + """Construct ConformerEncoder + + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. + key_bias: whether use bias in attention.linear_k, False for whisper models. 
+ """ + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, gradient_checkpointing, max_seq_short, max_seq_long) + activation = BAILING_ACTIVATION_CLASSES[activation_type]() + + # self-attention module definition + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + key_bias, + ) + # feed-forward module definition + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + BAILING_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args), + PositionwiseFeedForward(*positionwise_layer_args), + PositionwiseFeedForward( + *positionwise_layer_args) if macaron_style else None, + ConvolutionModule( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + ) for _ in range(num_blocks) + ]) diff --git a/audio_detokenizer/transformer/encoder_layer.py b/audio_detokenizer/transformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..cb44ab206be9c7068557573c1e6f30c646ca7550 --- /dev/null +++ b/audio_detokenizer/transformer/encoder_layer.py @@ -0,0 +1,341 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder self-attention layer definition.""" + +from typing import Optional, Tuple + +import torch +from torch import nn + + +class TransformerEncoderLayer(nn.Module): + """Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: to use layer_norm after each sub-block. 
+ """ + + def __init__( + self, + size: int, + self_attn: torch.nn.Module, + feed_forward: torch.nn.Module, + dropout_rate: float, + normalize_before: bool = True, + ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = nn.LayerNorm(size, eps=1e-5) + self.norm2 = nn.LayerNorm(size, eps=1e-5) + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + + Args: + x (torch.Tensor): (#batch, time, size) + mask (torch.Tensor): Mask tensor for the input (#batch, time,time), + (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): just for interface compatibility + to ConformerEncoderLayer + mask_pad (torch.Tensor): does not used in transformer layer, + just for unified api with conformer. + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. + cnn_cache (torch.Tensor): Convolution cache in conformer layer + (#batch=1, size, cache_t2), not used here, it's for interface + compatibility to ConformerEncoderLayer. + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time, time). + torch.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). + torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). + + """ + residual = x + if self.normalize_before: + x = self.norm1(x) + x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb=pos_emb, cache=att_cache) + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + return x, mask, new_att_cache, fake_cnn_cache + + @torch.inference_mode() + def inference( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + cache_offset: torch.Tensor = None, + cnn_cache: torch.Tensor = None, + is_infer_short: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + + Args: + x (torch.Tensor): (#batch, time, size) + mask (torch.Tensor): Mask tensor for the input (#batch, time,time), + (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): just for interface compatibility + to ConformerEncoderLayer + mask_pad (torch.Tensor): does not used in transformer layer, + just for unified api with conformer. + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. + cnn_cache (torch.Tensor): Convolution cache in conformer layer + (#batch=1, size, cache_t2), not used here, it's for interface + compatibility to ConformerEncoderLayer. + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time, time). + torch.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). + torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2). 
+ + """ + residual = x + if self.normalize_before: + x = self.norm1(x) + x_att = self.self_attn.inference(x, x, x, mask, pos_emb=pos_emb, cache_offset=cache_offset, is_infer_short=is_infer_short) + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + return x, None, None + +class ConformerEncoderLayer(nn.Module): + """Encoder layer module. + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. + feed_forward_macaron (torch.nn.Module): Additional feed-forward module + instance. + `PositionwiseFeedForward` instance can be used as the argument. + conv_module (torch.nn.Module): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): + True: use layer_norm before each sub-block. + False: use layer_norm after each sub-block. + """ + + def __init__( + self, + size: int, + self_attn: torch.nn.Module, + feed_forward: Optional[nn.Module] = None, + feed_forward_macaron: Optional[nn.Module] = None, + conv_module: Optional[nn.Module] = None, + dropout_rate: float = 0.1, + normalize_before: bool = True, + ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module + self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = nn.LayerNorm(size, eps=1e-5) # for the CNN module + self.norm_final = nn.LayerNorm( + size, eps=1e-5) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + + Args: + x (torch.Tensor): (#batch, time, size) + mask (torch.Tensor): Mask tensor for the input (#batch, time,time), + (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): positional encoding, must not be None + for ConformerEncoderLayer. + mask_pad (torch.Tensor): batch padding mask used for conv module. + (#batch, 1,time), (0, 0, 0) means fake mask. + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. + cnn_cache (torch.Tensor): Convolution cache in conformer layer + (#batch=1, size, cache_t2) + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time, time). + torch.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). 
+ torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). + """ + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, + att_cache) + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + # Fake new cnn cache here, and then change it in conv_module + new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) + x = residual + self.dropout(x) + + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + return x, mask, new_att_cache, new_cnn_cache + + def inference( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + cache_offset: torch.Tensor = None, + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + x_att = self.self_attn.inference(x, x, x, mask, pos_emb, cache_offset) + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + # Fake new cnn cache here, and then change it in conv_module + new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache) + x = residual + self.dropout(x) + + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + return x, mask, new_cnn_cache diff --git a/audio_detokenizer/transformer/label_smoothing_loss.py b/audio_detokenizer/transformer/label_smoothing_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..feacabf09609ee6eb047c89ce18d372256c72c71 --- /dev/null +++ b/audio_detokenizer/transformer/label_smoothing_loss.py @@ -0,0 +1,96 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
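# --- Minimal sketch of the macaron-style half-step residual used by
# ConformerEncoderLayer above (pre-norm variant): each of the two feed-forward
# modules contributes with ff_scale = 0.5, so the pair of half-steps adds up to
# one full feed-forward residual per block. The attention and convolution
# sub-blocks are elided here; toy sizes (size=8, hidden=16) are assumptions.
import torch
from torch import nn

class ToyMacaronBlock(nn.Module):
    def __init__(self, size: int = 8, hidden: int = 16, dropout: float = 0.0):
        super().__init__()
        self.ff_scale = 0.5
        self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
        self.norm_ff = nn.LayerNorm(size, eps=1e-5)
        self.ff_macaron = nn.Sequential(nn.Linear(size, hidden), nn.SiLU(), nn.Linear(hidden, size))
        self.ff = nn.Sequential(nn.Linear(size, hidden), nn.SiLU(), nn.Linear(hidden, size))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # first half-step FFN (macaron), pre-norm residual
        x = x + self.ff_scale * self.dropout(self.ff_macaron(self.norm_ff_macaron(x)))
        # ... self-attention / convolution sub-blocks would sit here ...
        # second half-step FFN, pre-norm residual
        x = x + self.ff_scale * self.dropout(self.ff(self.norm_ff(x)))
        return x

if __name__ == "__main__":
    y = ToyMacaronBlock()(torch.randn(2, 10, 8))
    print(y.shape)  # torch.Size([2, 10, 8])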
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Label smoothing module.""" + +import torch +from torch import nn + + +class LabelSmoothingLoss(nn.Module): + """Label-smoothing loss. + + In a standard CE loss, the label's data distribution is: + [0,1,2] -> + [ + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + + In the smoothing version CE Loss,some probabilities + are taken from the true label prob (1.0) and are divided + among other labels. + + e.g. + smoothing=0.1 + [0,1,2] -> + [ + [0.9, 0.05, 0.05], + [0.05, 0.9, 0.05], + [0.05, 0.05, 0.9], + ] + + Args: + size (int): the number of class + padding_idx (int): padding class id which will be ignored for loss + smoothing (float): smoothing rate (0.0 means the conventional CE) + normalize_length (bool): + normalize loss by sequence length if True + normalize loss by batch size if False + """ + + def __init__(self, + size: int, + padding_idx: int, + smoothing: float, + normalize_length: bool = False): + """Construct an LabelSmoothingLoss object.""" + super(LabelSmoothingLoss, self).__init__() + self.criterion = nn.KLDivLoss(reduction="none") + self.padding_idx = padding_idx + self.confidence = 1.0 - smoothing + self.smoothing = smoothing + self.size = size + self.normalize_length = normalize_length + + def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + """Compute loss between x and target. + + The model outputs and data labels tensors are flatten to + (batch*seqlen, class) shape and a mask is applied to the + padding part which should not be calculated for loss. + + Args: + x (torch.Tensor): prediction (batch, seqlen, class) + target (torch.Tensor): + target signal masked with self.padding_id (batch, seqlen) + Returns: + loss (torch.Tensor) : The KL loss, scalar float value + """ + assert x.size(2) == self.size + batch_size = x.size(0) + x = x.view(-1, self.size) + target = target.view(-1) + # use zeros_like instead of torch.no_grad() for true_dist, + # since no_grad() can not be exported by JIT + true_dist = torch.zeros_like(x) + true_dist.fill_(self.smoothing / (self.size - 1)) + ignore = target == self.padding_idx # (B,) + total = len(target) - ignore.sum().item() + target = target.masked_fill(ignore, 0) # avoid -1 index + true_dist.scatter_(1, target.unsqueeze(1), self.confidence) + kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) + denom = total if self.normalize_length else batch_size + return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom diff --git a/audio_detokenizer/transformer/positionwise_feed_forward.py b/audio_detokenizer/transformer/positionwise_feed_forward.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a2cf6e7315e3a5ed2794423daff0a59cc5b208 --- /dev/null +++ b/audio_detokenizer/transformer/positionwise_feed_forward.py @@ -0,0 +1,115 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
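# --- Minimal sketch of the smoothed target distribution that LabelSmoothingLoss
# above builds before the KL term: each true label keeps `1 - smoothing`
# probability mass, and the remainder is spread uniformly over the other
# `size - 1` classes. The numbers below are toy assumptions matching the
# docstring example (size=3, smoothing=0.1).
import torch

def toy_smoothed_targets(target: torch.Tensor, size: int, smoothing: float) -> torch.Tensor:
    # start from the uniform "leaked" mass, then place the confidence on the label
    true_dist = torch.full((target.size(0), size), smoothing / (size - 1))
    true_dist.scatter_(1, target.unsqueeze(1), 1.0 - smoothing)
    return true_dist

if __name__ == "__main__":
    print(toy_smoothed_targets(torch.tensor([0, 1, 2]), size=3, smoothing=0.1))
    # tensor([[0.9000, 0.0500, 0.0500],
    #         [0.0500, 0.9000, 0.0500],
    #         [0.0500, 0.0500, 0.9000]])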
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Positionwise feed forward layer definition.""" + +import torch + + +class PositionwiseFeedForward(torch.nn.Module): + """Positionwise feed forward layer. + + FeedForward are appied on each position of the sequence. + The output dim is same with the input dim. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. + activation (torch.nn.Module): Activation function + """ + + def __init__( + self, + idim: int, + hidden_units: int, + dropout_rate: float, + activation: torch.nn.Module = torch.nn.ReLU(), + ): + """Construct a PositionwiseFeedForward object.""" + super(PositionwiseFeedForward, self).__init__() + self.w_1 = torch.nn.Linear(idim, hidden_units) + self.activation = activation + self.dropout = torch.nn.Dropout(dropout_rate) + self.w_2 = torch.nn.Linear(hidden_units, idim) + + def forward(self, xs: torch.Tensor) -> torch.Tensor: + """Forward function. + + Args: + xs: input tensor (B, L, D) + Returns: + output tensor, (B, L, D) + """ + return self.w_2(self.dropout(self.activation(self.w_1(xs)))) + + +class MoEFFNLayer(torch.nn.Module): + """ + Mixture of expert with Positionwise feed forward layer + See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf + The output dim is same with the input dim. + + Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 + https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 + Args: + n_expert: number of expert. + n_expert_per_token: The actual number of experts used for each frame + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. + activation (torch.nn.Module): Activation function + """ + + def __init__( + self, + n_expert: int, + n_expert_per_token: int, + idim: int, + hidden_units: int, + dropout_rate: float, + activation: torch.nn.Module = torch.nn.ReLU(), + ): + super(MoEFFNLayer, self).__init__() + self.gate = torch.nn.Linear(idim, n_expert, bias=False) + self.experts = torch.nn.ModuleList( + PositionwiseFeedForward(idim, hidden_units, dropout_rate, + activation) for _ in range(n_expert)) + self.n_expert_per_token = n_expert_per_token + + def forward(self, xs: torch.Tensor) -> torch.Tensor: + """Foward function. 
+ Args: + xs: input tensor (B, L, D) + Returns: + output tensor, (B, L, D) + + """ + B, L, D = xs.size( + ) # batch size, sequence length, embedding dimension (idim) + xs = xs.view(-1, D) # (B*L, D) + router = self.gate(xs) # (B*L, n_expert) + logits, indices = torch.topk( + router, self.n_expert_per_token + ) # probs:(B*L, n_expert), indices: (B*L, n_expert) + weights = torch.nn.functional.softmax( + logits, dim=1, + dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token) + output = torch.zeros_like(xs) # (B*L, D) + for i, expert in enumerate(self.experts): + mask = indices == i + batch_idx, ith_expert = torch.where(mask) + output[batch_idx] += weights[batch_idx, ith_expert, None] * expert( + xs[batch_idx]) + return output.view(B, L, D) diff --git a/audio_detokenizer/transformer/subsampling.py b/audio_detokenizer/transformer/subsampling.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc1ba6ced6663335d6a3d43ee9663f4826ccd41 --- /dev/null +++ b/audio_detokenizer/transformer/subsampling.py @@ -0,0 +1,387 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2024 Alibaba Inc (Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Subsampling layer definition.""" + +from typing import Tuple, Union + +import torch + + +class BaseSubsampling(torch.nn.Module): + + def __init__(self): + super().__init__() + self.right_context = 0 + self.subsampling_rate = 1 + + def position_encoding(self, offset: Union[int, torch.Tensor], + size: int) -> torch.Tensor: + return self.pos_enc.position_encoding(offset, size) + + def fix_position_encoding(self, offset: Union[int, torch.Tensor], + size: int, max_len: int) -> torch.Tensor: + # import pdb;pdb.set_trace() + return self.pos_enc.fix_position_encoding(offset, size, max_len) + +class EmbedinigNoSubsampling(BaseSubsampling): + """Embedding input without subsampling + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + super().__init__() + self.embed = torch.nn.Embedding(idim, odim) + self.pos_enc = pos_enc_class + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Input x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + torch.Tensor: linear input mask (#batch, 1, time'), + where time' = time . + + """ + x = self.embed(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask + + +class LinearNoSubsampling(BaseSubsampling): + """Linear transform the input without subsampling + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an linear object.""" + super().__init__() + self.out = torch.nn.Sequential( + torch.nn.Linear(idim, odim), + torch.nn.LayerNorm(odim, eps=1e-5), + torch.nn.Dropout(dropout_rate), + ) + self.pos_enc = pos_enc_class + self.right_context = 0 + self.subsampling_rate = 1 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Input x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + torch.Tensor: linear input mask (#batch, 1, time'), + where time' = time . + + """ + x = self.out(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask + + +class Conv1dSubsampling2(BaseSubsampling): + """Convolutional 1D subsampling (to 1/2 length). + It is designed for Whisper, ref: + https://github.com/openai/whisper/blob/main/whisper/model.py + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv1dSubsampling2 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1), + torch.nn.GELU(), + torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1), + torch.nn.GELU(), + ) + self.pos_enc = pos_enc_class + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + self.subsampling_rate = 2 + # 4 = (3 - 1) * 1 + (3 - 1) * 1 + self.right_context = 4 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 2. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 2. + torch.Tensor: positional encoding + + """ + time = x.size(1) + x = x.transpose(1, 2) # (b, f, t) + x = self.conv(x) + x = x.transpose(1, 2) # (b, t, f) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, (time + 1) % 2::2] + + +class Conv2dSubsampling4(BaseSubsampling): + """Convolutional 2D subsampling (to 1/4 length). + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv2dSubsampling4 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 2), + torch.nn.ReLU(), + ) + self.out = torch.nn.Sequential( + torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + self.pos_enc = pos_enc_class + # The right context for every conv layer is computed by: + # (kernel_size - 1) * frame_rate_of_this_layer + self.subsampling_rate = 4 + # 6 = (3 - 1) * 1 + (3 - 1) * 2 + self.right_context = 6 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 4. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 4. + torch.Tensor: positional encoding + + """ + x = x.unsqueeze(1) # (b, c=1, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2] + + +class Conv2dSubsampling6(BaseSubsampling): + """Convolutional 2D subsampling (to 1/6 length). + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (torch.nn.Module): Custom position encoding layer. + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv2dSubsampling6 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 5, 3), + torch.nn.ReLU(), + ) + self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), + odim) + self.pos_enc = pos_enc_class + # 10 = (3 - 1) * 1 + (5 - 1) * 2 + self.subsampling_rate = 6 + self.right_context = 10 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 6. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 6. + torch.Tensor: positional encoding + """ + x = x.unsqueeze(1) # (b, c, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3] + + +class Conv2dSubsampling8(BaseSubsampling): + """Convolutional 2D subsampling (to 1/8 length). + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an Conv2dSubsampling8 object.""" + super().__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 2), + torch.nn.ReLU(), + ) + self.linear = torch.nn.Linear( + odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) + self.pos_enc = pos_enc_class + self.subsampling_rate = 8 + # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 + self.right_context = 14 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 8. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 8. + torch.Tensor: positional encoding + """ + x = x.unsqueeze(1) # (b, c, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2] + + +class LegacyLinearNoSubsampling(BaseSubsampling): + """Linear transform the input without subsampling + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim: int, odim: int, dropout_rate: float, + pos_enc_class: torch.nn.Module): + """Construct an linear object.""" + super().__init__() + self.out = torch.nn.Sequential( + torch.nn.Linear(idim, odim), + torch.nn.LayerNorm(odim, eps=1e-5), + torch.nn.Dropout(dropout_rate), + torch.nn.ReLU(), + ) + self.pos_enc = pos_enc_class + self.right_context = 0 + self.subsampling_rate = 1 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + offset: Union[int, torch.Tensor] = 0 + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Input x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: linear input tensor (#batch, time', odim), + where time' = time . + torch.Tensor: linear input mask (#batch, 1, time'), + where time' = time . 
+ + """ + x = self.out(x) + x, pos_emb = self.pos_enc(x, offset) + return x, pos_emb, x_mask diff --git a/audio_detokenizer/utils/__init__.py b/audio_detokenizer/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/audio_detokenizer/utils/__pycache__/__init__.cpython-38.pyc b/audio_detokenizer/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1a6bd279e28c6044415b1e4da2c73d5db9085bc Binary files /dev/null and b/audio_detokenizer/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/audio_detokenizer/utils/__pycache__/class_utils.cpython-38.pyc b/audio_detokenizer/utils/__pycache__/class_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c6d11c6c8fbe4588dd7616d1607e9fad030b9de Binary files /dev/null and b/audio_detokenizer/utils/__pycache__/class_utils.cpython-38.pyc differ diff --git a/audio_detokenizer/utils/__pycache__/common.cpython-38.pyc b/audio_detokenizer/utils/__pycache__/common.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb10c16e357ed540737d862cf61cfb4c5142712e Binary files /dev/null and b/audio_detokenizer/utils/__pycache__/common.cpython-38.pyc differ diff --git a/audio_detokenizer/utils/__pycache__/mask.cpython-38.pyc b/audio_detokenizer/utils/__pycache__/mask.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da5b89e3ff3d85e6c40c8bd948601b2ebd4828ba Binary files /dev/null and b/audio_detokenizer/utils/__pycache__/mask.cpython-38.pyc differ diff --git a/audio_detokenizer/utils/class_utils.py b/audio_detokenizer/utils/class_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..68bd45f74d6890849e84e75132692fb59be7bfbd --- /dev/null +++ b/audio_detokenizer/utils/class_utils.py @@ -0,0 +1,71 @@ +# Copyright [2023-11-28] +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
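# --- Minimal sketch of the length arithmetic behind Conv2dSubsampling4 above:
# two Conv2d(kernel_size=3, stride=2, padding=0) layers shrink the time axis
# from T to ((T - 1) // 2 - 1) // 2 (roughly T // 4), which is also the length
# produced by the mask slicing x_mask[:, :, 2::2][:, :, 2::2]. The sizes below
# (T=100, idim=80, odim=16) are toy assumptions, not tied to any config.
import torch

if __name__ == "__main__":
    T, idim, odim = 100, 80, 16
    conv = torch.nn.Sequential(
        torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU(),
        torch.nn.Conv2d(odim, odim, 3, 2), torch.nn.ReLU(),
    )
    x = torch.randn(1, 1, T, idim)              # (b, c=1, t, f)
    t_out = conv(x).size(2)                      # time axis after subsampling
    mask = torch.ones(1, 1, T, dtype=torch.bool)
    t_mask = mask[:, :, 2::2][:, :, 2::2].size(2)
    expected = ((T - 1) // 2 - 1) // 2
    print(t_out, t_mask, expected)               # 24 24 24
    assert t_out == t_mask == expected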
+# antflake8: noqa +import torch + +from ..transformer.activation import Swish +from ..transformer.subsampling import ( + LinearNoSubsampling, + EmbedinigNoSubsampling, + Conv1dSubsampling2, + Conv2dSubsampling4, + Conv2dSubsampling6, + Conv2dSubsampling8, +) +from ..transformer.embedding import (PositionalEncoding, + RelPositionalEncoding, + WhisperPositionalEncoding, + LearnablePositionalEncoding, + NoPositionalEncoding) +from ..transformer.attention import (MultiHeadedAttention, + RelPositionMultiHeadedAttention) +from ..transformer.embedding import EspnetRelPositionalEncoding +from ..transformer.subsampling import LegacyLinearNoSubsampling + + +BAILING_ACTIVATION_CLASSES = { + "hardtanh": torch.nn.Hardtanh, + "tanh": torch.nn.Tanh, + "relu": torch.nn.ReLU, + "selu": torch.nn.SELU, + "swish": getattr(torch.nn, "SiLU", Swish), + "gelu": torch.nn.GELU, +} + +BAILING_SUBSAMPLE_CLASSES = { + "linear": LinearNoSubsampling, + "linear_legacy": LegacyLinearNoSubsampling, + "embed": EmbedinigNoSubsampling, + "conv1d2": Conv1dSubsampling2, + "conv2d": Conv2dSubsampling4, + "conv2d6": Conv2dSubsampling6, + "conv2d8": Conv2dSubsampling8, + 'paraformer_dummy': torch.nn.Identity +} + +BAILING_EMB_CLASSES = { + "embed": PositionalEncoding, + "abs_pos": PositionalEncoding, + "rel_pos": RelPositionalEncoding, + "rel_pos_espnet": EspnetRelPositionalEncoding, + "no_pos": NoPositionalEncoding, + "abs_pos_whisper": WhisperPositionalEncoding, + "embed_learnable_pe": LearnablePositionalEncoding, +} + +BAILING_ATTENTION_CLASSES = { + "selfattn": MultiHeadedAttention, + "rel_selfattn": RelPositionMultiHeadedAttention, +} diff --git a/audio_detokenizer/utils/common.py b/audio_detokenizer/utils/common.py new file mode 100644 index 0000000000000000000000000000000000000000..a67291e6bc306dc376860f752335f4d36c232d52 --- /dev/null +++ b/audio_detokenizer/utils/common.py @@ -0,0 +1,140 @@ +# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Unility functions for Transformer.""" + +from typing import List + +import torch + +IGNORE_ID = -1 + + +def pad_list(xs: List[torch.Tensor], pad_value: int): + """Perform padding for the list of tensors. + + Args: + xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): Value for padding. + + Returns: + Tensor: Padded tensor (B, Tmax, `*`). 
+ + Examples: + >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) + + """ + max_len = max([len(item) for item in xs]) + batchs = len(xs) + ndim = xs[0].ndim + if ndim == 1: + pad_res = torch.zeros(batchs, + max_len, + dtype=xs[0].dtype, + device=xs[0].device) + elif ndim == 2: + pad_res = torch.zeros(batchs, + max_len, + xs[0].shape[1], + dtype=xs[0].dtype, + device=xs[0].device) + elif ndim == 3: + pad_res = torch.zeros(batchs, + max_len, + xs[0].shape[1], + xs[0].shape[2], + dtype=xs[0].dtype, + device=xs[0].device) + else: + raise ValueError(f"Unsupported ndim: {ndim}") + pad_res.fill_(pad_value) + for i in range(batchs): + pad_res[i, :len(xs[i])] = xs[i] + return pad_res + + +def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor, + ignore_label: int) -> torch.Tensor: + """Calculate accuracy. + + Args: + pad_outputs (Tensor): Prediction tensors (B * Lmax, D). + pad_targets (LongTensor): Target label tensors (B, Lmax). + ignore_label (int): Ignore label id. + + Returns: + torch.Tensor: Accuracy value (0.0 - 1.0). + + """ + pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1), + pad_outputs.size(1)).argmax(2) + mask = pad_targets != ignore_label + numerator = torch.sum( + pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + denominator = torch.sum(mask) + return (numerator / denominator).detach() + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +# Repetition Aware Sampling in VALL-E 2 +def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1): + top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k) + rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item() + if rep_num >= win_size * tau_r: + top_ids = random_sampling(weighted_scores, decoded_tokens, sampling) + return top_ids + +def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25): + prob, indices = [], [] + cum_prob = 0.0 + sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True) + for i in range(len(sorted_idx)): + # sampling both top-p and numbers. 
+ if cum_prob < top_p and len(prob) < top_k: + cum_prob += sorted_value[i] + prob.append(sorted_value[i]) + indices.append(sorted_idx[i]) + else: + break + prob = torch.tensor(prob).to(weighted_scores) + indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device) + top_ids = indices[prob.multinomial(1, replacement=True)] + return top_ids + +def random_sampling(weighted_scores, decoded_tokens, sampling): + top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True) + return top_ids + +def fade_in_out(fade_in_mel, fade_out_mel, window): + device = fade_in_mel.device + fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu() + mel_overlap_len = int(window.shape[0] / 2) + fade_in_mel[:, :, :mel_overlap_len] = fade_in_mel[:, :, :mel_overlap_len] * window[:mel_overlap_len] + fade_out_mel[:, :, -mel_overlap_len:] * window[mel_overlap_len:] + return fade_in_mel.to(device) diff --git a/audio_detokenizer/utils/file_utils.py b/audio_detokenizer/utils/file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bc6f55fb0d3d3461ea2f0acc0a708b5b435aa619 --- /dev/null +++ b/audio_detokenizer/utils/file_utils.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# antflake8: noqa +import json +import torchaudio + + +def read_lists(list_file): + lists = [] + with open(list_file, 'r', encoding='utf8') as fin: + for line in fin: + lists.append(line.strip()) + return lists + +def read_json_lists(list_file): + lists = read_lists(list_file) + results = {} + for fn in lists: + with open(fn, 'r', encoding='utf8') as fin: + results.update(json.load(fin)) + return results + +def load_wav(wav, target_sr): + speech, sample_rate = torchaudio.load(wav) + speech = speech.mean(dim=0, keepdim=True) + if sample_rate != target_sr: + assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) + speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) + return speech diff --git a/audio_detokenizer/utils/frontend_utils.py b/audio_detokenizer/utils/frontend_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d7702423ff42413fde84d560cde41febf179aa96 --- /dev/null +++ b/audio_detokenizer/utils/frontend_utils.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
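# --- Illustrative sketch (editor's note, not part of this diff) ---
# The repetition-aware sampling defined above draws from the nucleus (top-p / top-k)
# by default and falls back to plain multinomial sampling when the chosen token already
# dominates the last `win_size` decoded tokens. The import assumes the package layout
# added by this diff is on PYTHONPATH; the logits and history below are synthetic.
import torch
from audio_detokenizer.utils.common import ras_sampling

torch.manual_seed(0)
logits = torch.randn(1024)                     # unnormalised scores for one decoding step
decoded = [7] * 10                             # a deliberately repetitive history
next_token = ras_sampling(
    logits,
    decoded_tokens=decoded,
    sampling=None,                             # unused by random_sampling in this implementation
    top_p=0.8, top_k=25, win_size=10, tau_r=0.1,
)
print(int(next_token))                         # a single sampled token id
# --- end editor's note ---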
+# antflake8: noqa + +import re +chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') + +# whether contain chinese character +def contains_chinese(text): + return bool(chinese_char_pattern.search(text)) + + +# replace special symbol +def replace_corner_mark(text): + text = text.replace('²', '平方') + text = text.replace('³', '立方') + return text + + +# remove meaningless symbol +def remove_bracket(text): + text = text.replace('(', '').replace(')', '') + text = text.replace('【', '').replace('】', '') + text = text.replace('`', '').replace('`', '') + text = text.replace("——", " ") + return text + + +# spell Arabic numerals +def spell_out_number(text: str, inflect_parser): + new_text = [] + st = None + for i, c in enumerate(text): + if not c.isdigit(): + if st is not None: + num_str = inflect_parser.number_to_words(text[st: i]) + new_text.append(num_str) + st = None + new_text.append(c) + else: + if st is None: + st = i + if st is not None and st < len(text): + num_str = inflect_parser.number_to_words(text[st:]) + new_text.append(num_str) + return ''.join(new_text) + + +# split paragrah logic: +# 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len +# 2. cal sentence len according to lang +# 3. split sentence according to puncatation +def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): + def calc_utt_length(_text: str): + if lang == "zh": + return len(_text) + else: + return len(tokenize(_text)) + + def should_merge(_text: str): + if lang == "zh": + return len(_text) < merge_len + else: + return len(tokenize(_text)) < merge_len + + if lang == "zh": + pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';'] + else: + pounc = ['.', '?', '!', ';', ':'] + if comma_split: + pounc.extend([',', ',']) + st = 0 + utts = [] + for i, c in enumerate(text): + if c in pounc: + if len(text[st: i]) > 0: + utts.append(text[st: i] + c) + if i + 1 < len(text) and text[i + 1] in ['"', '”']: + tmp = utts.pop(-1) + utts.append(tmp + text[i + 1]) + st = i + 2 + else: + st = i + 1 + if len(utts) == 0: + if lang == "zh": + utts.append(text + '。') + else: + utts.append(text + '.') + final_utts = [] + cur_utt = "" + for utt in utts: + if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: + final_utts.append(cur_utt) + cur_utt = "" + cur_utt = cur_utt + utt + if len(cur_utt) > 0: + if should_merge(cur_utt) and len(final_utts) != 0: + final_utts[-1] = final_utts[-1] + cur_utt + else: + final_utts.append(cur_utt) + + return final_utts + + +# remove blank between chinese character +def replace_blank(text: str): + out_str = [] + for i, c in enumerate(text): + if c == " ": + if ((text[i + 1].isascii() and text[i + 1] != " ") and + (text[i - 1].isascii() and text[i - 1] != " ")): + out_str.append(c) + else: + out_str.append(c) + return "".join(out_str) diff --git a/audio_detokenizer/utils/mask.py b/audio_detokenizer/utils/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..2b460bbd5adb4bd61d643ace71400a14fe314236 --- /dev/null +++ b/audio_detokenizer/utils/mask.py @@ -0,0 +1,227 @@ +# Copyright (c) 2019 Shigeki Karita +# 2020 Mobvoi Inc (Binbin Zhang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +''' +def subsequent_mask( + size: int, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size). + + This mask is used only in decoder which works in an auto-regressive mode. + This means the current step could only do attention with its left steps. + + In encoder, fully attention is used when streaming is not necessary and + the sequence is not long. In this case, no attention mask is needed. + + When streaming is need, chunk-based attention is used in encoder. See + subsequent_chunk_mask for the chunk-based attention mask. + + Args: + size (int): size of mask + str device (str): "cpu" or "cuda" or torch.Tensor.device + dtype (torch.device): result dtype + + Returns: + torch.Tensor: mask + + Examples: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + ret = torch.ones(size, size, device=device, dtype=torch.bool) + return torch.tril(ret) +''' + + +def subsequent_mask( + size: int, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size). + + This mask is used only in decoder which works in an auto-regressive mode. + This means the current step could only do attention with its left steps. + + In encoder, fully attention is used when streaming is not necessary and + the sequence is not long. In this case, no attention mask is needed. + + When streaming is need, chunk-based attention is used in encoder. See + subsequent_chunk_mask for the chunk-based attention mask. 
+ + Args: + size (int): size of mask + str device (str): "cpu" or "cuda" or torch.Tensor.device + dtype (torch.device): result dtype + + Returns: + torch.Tensor: mask + + Examples: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + arange = torch.arange(size, device=device) + mask = arange.expand(size, size) + arange = arange.unsqueeze(-1) + mask = mask <= arange + return mask + + +def subsequent_chunk_mask( + size: int, + chunk_size: int, + num_left_chunks: int = -1, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size) with chunk size, + this is for streaming encoder + + Args: + size (int): size of mask + chunk_size (int): size of chunk + num_left_chunks (int): number of left chunks + <0: use full chunk + >=0: use num_left_chunks + device (torch.device): "cpu" or "cuda" or torch.Tensor.device + + Returns: + torch.Tensor: mask + + Examples: + >>> subsequent_chunk_mask(4, 2) + [[1, 1, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 1], + [1, 1, 1, 1]] + """ + ret = torch.zeros(size, size, device=device, dtype=torch.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) + ending = min((i // chunk_size + 1) * chunk_size, size) + ret[i, start:ending] = True + return ret + + +def add_optional_chunk_mask(xs: torch.Tensor, + masks: torch.Tensor, + use_dynamic_chunk: bool, + use_dynamic_left_chunk: bool, + decoding_chunk_size: int, + static_chunk_size: int, + num_decoding_left_chunks: int, + enable_full_context: bool = True): + """ Apply optional mask for encoder. + + Args: + xs (torch.Tensor): padded input, (B, L, D), L for max length + mask (torch.Tensor): mask for xs, (B, 1, L) + use_dynamic_chunk (bool): whether to use dynamic chunk or not + use_dynamic_left_chunk (bool): whether to use dynamic left chunk for + training. + decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + static_chunk_size (int): chunk size for static chunk training/decoding + if it's greater than 0, if use_dynamic_chunk is true, + this parameter will be ignored + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + enable_full_context (bool): + True: chunk size is either [1, 25] or full context(max_len) + False: chunk size ~ U[1, 25] + + Returns: + torch.Tensor: chunk mask of the input xs. + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: + max_len = xs.size(1) + if decoding_chunk_size < 0: + chunk_size = max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: + chunk_size = decoding_chunk_size + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. 
+ chunk_size = torch.randint(1, max_len, (1, )).item() + num_left_chunks = -1 + if chunk_size > max_len // 2 and enable_full_context: + chunk_size = max_len + else: + chunk_size = chunk_size % 25 + 1 + if use_dynamic_left_chunk: + max_left_chunks = (max_len - 1) // chunk_size + num_left_chunks = torch.randint(0, max_left_chunks, + (1, )).item() + chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) + elif static_chunk_size > 0: + num_left_chunks = num_decoding_left_chunks + chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) + else: + chunk_masks = masks + return chunk_masks + + +def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: + """Make mask tensor containing indices of padded part. + + See description of make_non_pad_mask. + + Args: + lengths (torch.Tensor): Batch of lengths (B,). + Returns: + torch.Tensor: Mask tensor containing indices of padded part. + + Examples: + >>> lengths = [5, 3, 2] + >>> make_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + """ + batch_size = lengths.size(0) + max_len = max_len if max_len > 0 else lengths.max().item() + seq_range = torch.arange(0, + max_len, + dtype=torch.int64, + device=lengths.device) + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + seq_length_expand = lengths.unsqueeze(-1) + mask = seq_range_expand >= seq_length_expand + return mask diff --git a/audio_detokenizer/utils/scheduler.py b/audio_detokenizer/utils/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf4803f81bd7a3cee4af7bd8b6af2d3b46304d7 --- /dev/null +++ b/audio_detokenizer/utils/scheduler.py @@ -0,0 +1,739 @@ +# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +# 2022 Ximalaya Inc (Yuguang Yang) +# 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +# NeMo(https://github.com/NVIDIA/NeMo) + +from typing import Union + +import math +import warnings +import torch +from torch.optim.lr_scheduler import _LRScheduler + + +class WarmupLR(_LRScheduler): + """The WarmupLR scheduler + + This scheduler is almost same as NoamLR Scheduler except for following + difference: + + NoamLR: + lr = optimizer.lr * model_size ** -0.5 + * min(step ** -0.5, step * warmup_step ** -1.5) + WarmupLR: + lr = optimizer.lr * warmup_step ** 0.5 + * min(step ** -0.5, step * warmup_step ** -1.5) + + Note that the maximum lr equals to optimizer.lr in this scheduler. 
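# --- Illustrative sketch (editor's note, not part of this diff) ---
# With the WarmupLR rule above, lr(step) = base_lr * warmup_steps**0.5 *
# min(step**-0.5, step * warmup_steps**-1.5): the rate ramps up, peaks at base_lr when
# step == warmup_steps, then decays as step**-0.5. The import assumes this diff's package
# layout; the model and step count are placeholders.
import torch
from audio_detokenizer.utils.scheduler import WarmupLR

model = torch.nn.Linear(4, 4)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = WarmupLR(opt, warmup_steps=1000)
lrs = []
for _ in range(3000):
    opt.step()                   # a real loop would call loss.backward() first
    sched.step()
    lrs.append(opt.param_groups[0]["lr"])
print(max(lrs))                  # ~1e-3, reached around step == warmup_steps
# --- end editor's note ---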
+ + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + warmup_steps: Union[int, float] = 25000, + last_epoch: int = -1, + ): + self.warmup_steps = warmup_steps + + # __init__() must be invoked before setting field + # because step() is also invoked in __init__() + super().__init__(optimizer, last_epoch) + + def __repr__(self): + return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" + + def get_lr(self): + step_num = self.last_epoch + 1 + if self.warmup_steps == 0: + return [lr * step_num**-0.5 for lr in self.base_lrs] + else: + return [ + lr * self.warmup_steps**0.5 * + min(step_num**-0.5, step_num * self.warmup_steps**-1.5) + for lr in self.base_lrs + ] + + def set_step(self, step: int): + self.last_epoch = step + + +class WarmupPolicy(_LRScheduler): + """Adds warmup kwargs and warmup logic to lr policy. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + """ + + def __init__(self, + optimizer, + *, + warmup_steps=None, + warmup_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1): + assert not (warmup_steps is not None and warmup_ratio is not None),\ + "Either use particular number of step or ratio" + assert warmup_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + if step <= self.warmup_steps and self.warmup_steps > 0: + return self._get_warmup_lr(step) + + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + def _get_warmup_lr(self, step): + lr_val = (step + 1) / (self.warmup_steps + 1) + return [initial_lr * lr_val for initial_lr in self.base_lrs] + + def _get_lr(self, step): + """Simple const lr policy""" + return self.base_lrs + + +class SquareRootConstantPolicy(_LRScheduler): + """Adds warmup kwargs and warmup logic to lr policy. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + """ + + def __init__(self, + optimizer, + *, + constant_steps=None, + constant_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1): + assert not (constant_steps is not None + and constant_ratio is not None), \ + "Either use particular number of step or ratio" + assert constant_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. 
+ self.max_steps = max_steps + if constant_steps is not None: + self.constant_steps = constant_steps + elif constant_ratio is not None: + self.constant_steps = int(constant_ratio * max_steps) + else: + self.constant_steps = 0 + + self.constant_lr = 1 / (constant_steps**0.5) + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + if step <= self.constant_steps: + return [self.constant_lr for _ in self.base_lrs] + + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + def _get_lr(self, step): + """Simple const lr policy""" + return self.base_lrs + + +class WarmupHoldPolicy(WarmupPolicy): + """Variant of WarmupPolicy which maintains high + learning rate for a defined number of steps. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + hold_steps: Number of training steps to + hold the learning rate after warm up + hold_ratio: Ratio of hold steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + """ + + def __init__( + self, + optimizer, + *, + warmup_steps=None, + warmup_ratio=None, + hold_steps=None, + hold_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, + ): + assert not (hold_steps is not None and hold_ratio is not None), \ + "Either use particular number of step or ratio" + assert hold_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + self.min_lr = min_lr + self._last_warmup_lr = 0.0 + + # Necessary to duplicate as class attributes are hidden in inner class + self.max_steps = max_steps + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + if hold_steps is not None: + self.hold_steps = hold_steps + self.warmup_steps + elif hold_ratio is not None: + self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps + else: + self.hold_steps = 0 + + super().__init__( + optimizer, + warmup_steps=warmup_steps, + warmup_ratio=warmup_ratio, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + ) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed by the scheduler," + " " + "please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + # Warmup phase + if step <= self.warmup_steps and self.warmup_steps > 0: + return self._get_warmup_lr(step) + + # Hold phase + if (step >= self.warmup_steps) and (step < self.hold_steps): + return self.base_lrs + + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + +class WarmupAnnealHoldPolicy(_LRScheduler): + """Adds warmup kwargs and warmup logic to lr policy. + All arguments should be passed as kwargs for clarity, + Args: + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + min_lr: Minimum lr to hold the learning rate after decay at. + constant_steps: Number of steps to keep lr constant at. 
+ constant_ratio: Ratio of steps to keep lr constant. + """ + + def __init__( + self, + optimizer, + *, + warmup_steps=None, + warmup_ratio=None, + constant_steps=None, + constant_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, + ): + assert not (warmup_steps is not None + and warmup_ratio is not None), \ + "Either use particular number of step or ratio" + assert not (constant_steps is not None + and constant_ratio is not None), \ + "Either use constant_steps or constant_ratio" + assert warmup_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + if constant_steps is not None: + self.constant_steps = constant_steps + elif constant_ratio is not None: + self.constant_steps = int(constant_ratio * max_steps) + else: + self.constant_steps = 0 + + self.decay_steps = max_steps - (self.constant_steps + + self.warmup_steps) + + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = self.last_epoch + + # Warmup steps + if self.warmup_steps > 0 and step <= self.warmup_steps: + return self._get_warmup_lr(step) + + # Constant steps after warmup and decay + if self.constant_steps > 0 and ( + self.warmup_steps + self.decay_steps) < step <= self.max_steps: + return self._get_constant_lr(step) + + # Min lr after max steps of updates + if step > self.max_steps: + return [self.min_lr for _ in self.base_lrs] + + return self._get_lr(step) + + def _get_warmup_lr(self, step): + lr_val = (step + 1) / (self.warmup_steps + 1) + return [initial_lr * lr_val for initial_lr in self.base_lrs] + + def _get_constant_lr(self, step): + return [self.min_lr for _ in self.base_lrs] + + def _get_lr(self, step): + """Simple const lr policy""" + return self.base_lrs + + +def _squareroot_annealing(initial_lr, step, max_steps, min_lr): + mult = ((max_steps - step) / max_steps)**0.5 + out_lr = initial_lr * mult + out_lr = max(out_lr, min_lr) + return out_lr + + +def _square_annealing(initial_lr, step, max_steps, min_lr): + mult = ((max_steps - step) / max_steps)**2 + out_lr = initial_lr * mult + out_lr = max(out_lr, min_lr) + return out_lr + + +def _cosine_annealing(initial_lr, step, max_steps, min_lr): + mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) + out_lr = (initial_lr - min_lr) * mult + min_lr + return out_lr + + +def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, + decay_steps, min_lr): + assert max_lr > min_lr + # Use linear warmup for the initial part. + if warmup_steps > 0 and step <= warmup_steps: + return max_lr * float(step) / float(warmup_steps) + + # For any steps larger than `decay_steps`, use `min_lr`. + if step > warmup_steps + decay_steps: + return min_lr + + # If we are done with the warmup period, use the decay style. 
+ num_steps_ = step - warmup_steps + decay_steps_ = decay_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = max_lr - min_lr + + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + + return min_lr + coeff * delta_lr + + +def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): + if cycle: + multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) + decay_steps *= multiplier + else: + step = min(step, decay_steps) + p = step / decay_steps + lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) + lr += min_lr + return lr + + +def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps, + decay_rate, min_lr): + # hold_steps = total number of steps + # to hold the LR, not the warmup + hold steps. + T_warmup_decay = max(1, warmup_steps**decay_rate) + T_hold_decay = max(1, (step - hold_steps)**decay_rate) + lr = (initial_lr * T_warmup_decay) / T_hold_decay + lr = max(lr, min_lr) + return lr + + +class SquareAnnealing(WarmupPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + min_lr=1e-5, + last_epoch=-1, + **kwargs): + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + new_lrs = [ + _square_annealing( + initial_lr=initial_lr, + step=step - self.warmup_steps, + max_steps=self.max_steps - self.warmup_steps, + min_lr=self.min_lr, + ) for initial_lr in self.base_lrs + ] + return new_lrs + + +class SquareRootAnnealing(WarmupPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + min_lr=0, + last_epoch=-1, + **kwargs): + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + new_lrs = [ + _squareroot_annealing(initial_lr=initial_lr, + step=step, + max_steps=self.max_steps, + min_lr=self.min_lr) + for initial_lr in self.base_lrs + ] + return new_lrs + + +class CosineAnnealing(WarmupAnnealHoldPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + min_lr=0, + last_epoch=-1, + **kwargs): + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + for initial_lr in self.base_lrs: + if initial_lr < self.min_lr: + raise ValueError( + f"{self} received an initial learning rate " + f"that was lower than the minimum learning rate.") + + if self.constant_steps is None or self.constant_steps == 0: + new_lrs = [ + _cosine_annealing( + initial_lr=initial_lr, + step=step - self.warmup_steps, + max_steps=self.max_steps - self.warmup_steps, + min_lr=self.min_lr, + ) for initial_lr in self.base_lrs + ] + else: + new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) + return new_lrs + + def _get_warmup_lr(self, step): + if self.constant_steps is None or self.constant_steps == 0: + return super()._get_warmup_lr(step) + else: + # Use linear warmup for the initial part. + return self._get_linear_warmup_with_cosine_annealing_lr(step) + + def _get_constant_lr(self, step): + # Only called when `constant_steps` > 0. + return self._get_linear_warmup_with_cosine_annealing_lr(step) + + def _get_linear_warmup_with_cosine_annealing_lr(self, step): + # Cosine Schedule for Megatron LM, + # slightly different warmup schedule + constant LR at the end. 
+ new_lrs = [ + _linear_warmup_with_cosine_annealing( + max_lr=self.base_lrs[0], + warmup_steps=self.warmup_steps, + step=step, + decay_steps=self.decay_steps, + min_lr=self.min_lr, + ) for _ in self.base_lrs + ] + return new_lrs + + +class NoamAnnealing(_LRScheduler): + + def __init__(self, + optimizer, + *, + d_model, + warmup_steps=None, + warmup_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1): + self._normalize = d_model**(-0.5) + assert not (warmup_steps is not None + and warmup_ratio is not None), \ + "Either use particular number of step or ratio" + assert warmup_ratio is None or max_steps is not None, \ + "If there is a ratio, there should be a total steps" + + # It is necessary to assign all attributes *before* __init__, + # as class is wrapped by an inner class. + self.max_steps = max_steps + if warmup_steps is not None: + self.warmup_steps = warmup_steps + elif warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * max_steps) + else: + self.warmup_steps = 0 + + self.min_lr = min_lr + super().__init__(optimizer, last_epoch) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn( + "To get the last learning rate computed " + "by the scheduler, please use `get_last_lr()`.", + UserWarning, + stacklevel=2) + + step = max(1, self.last_epoch) + + for initial_lr in self.base_lrs: + if initial_lr < self.min_lr: + raise ValueError( + f"{self} received an initial learning rate " + f"that was lower than the minimum learning rate.") + + new_lrs = [ + self._noam_annealing(initial_lr=initial_lr, step=step) + for initial_lr in self.base_lrs + ] + return new_lrs + + def _noam_annealing(self, initial_lr, step): + if self.warmup_steps > 0: + mult = self._normalize * min(step**(-0.5), + step * (self.warmup_steps**(-1.5))) + else: + mult = self._normalize * step**(-0.5) + + out_lr = initial_lr * mult + if step > self.warmup_steps: + out_lr = max(out_lr, self.min_lr) + return out_lr + + +class NoamHoldAnnealing(WarmupHoldPolicy): + + def __init__(self, + optimizer, + *, + max_steps, + decay_rate=0.5, + min_lr=0.0, + last_epoch=-1, + **kwargs): + """ + From Nemo: + Implementation of the Noam Hold Annealing policy + from the SqueezeFormer paper. + + Unlike NoamAnnealing, the peak learning rate + can be explicitly set for this scheduler. + The schedule first performs linear warmup, + then holds the peak LR, then decays with some schedule for + the remainder of the steps. + Therefore the min-lr is still dependent + on the hyper parameters selected. + + It's schedule is determined by three factors- + + Warmup Steps: Initial stage, where linear warmup + occurs uptil the peak LR is reached. Unlike NoamAnnealing, + the peak LR is explicitly stated here instead of a scaling factor. + + Hold Steps: Intermediate stage, where the peak LR + is maintained for some number of steps. In this region, + the high peak LR allows the model to converge faster + if training is stable. However the high LR + may also cause instability during training. + Should usually be a significant fraction of training + steps (around 30-40% of the entire training steps). + + Decay Steps: Final stage, where the LR rapidly decays + with some scaling rate (set by decay rate). + To attain Noam decay, use 0.5, + for Squeezeformer recommended decay, use 1.0. + The fast decay after prolonged high LR during + hold phase allows for rapid convergence. 
+ + References: + - [Squeezeformer: + An Efficient Transformer for Automatic Speech Recognition] + (https://arxiv.org/abs/2206.00888) + + Args: + optimizer: Pytorch compatible Optimizer object. + warmup_steps: Number of training steps in warmup stage + warmup_ratio: Ratio of warmup steps to total steps + hold_steps: Number of training steps to + hold the learning rate after warm up + hold_ratio: Ratio of hold steps to total steps + max_steps: Total number of steps while training or `None` for + infinite training + decay_rate: Float value describing the polynomial decay + after the hold period. Default value + of 0.5 corresponds to Noam decay. + min_lr: Minimum learning rate. + """ + self.decay_rate = decay_rate + super().__init__(optimizer=optimizer, + max_steps=max_steps, + last_epoch=last_epoch, + min_lr=min_lr, + **kwargs) + + def _get_lr(self, step): + if self.warmup_steps is None or self.warmup_steps == 0: + raise ValueError( + "Noam scheduler cannot be used without warmup steps") + + if self.hold_steps > 0: + hold_steps = self.hold_steps - self.warmup_steps + else: + hold_steps = 0 + + new_lrs = [ + _noam_hold_annealing( + initial_lr, + step=step, + warmup_steps=self.warmup_steps, + hold_steps=hold_steps, + decay_rate=self.decay_rate, + min_lr=self.min_lr, + ) for initial_lr in self.base_lrs + ] + return new_lrs + + def set_step(self, step: int): + self.last_epoch = step + + +class ConstantLR(_LRScheduler): + """The ConstantLR scheduler + + This scheduler keeps a constant lr + + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + ): + # __init__() must be invoked before setting field + # because step() is also invoked in __init__() + super().__init__(optimizer) + + def get_lr(self): + return self.base_lrs + + def set_step(self, step: int): + self.last_epoch = step diff --git a/audio_processing_bailingmm.py b/audio_processing_bailingmm.py new file mode 100644 index 0000000000000000000000000000000000000000..73ba64920a6aa5fa7a66150de56d01cf8285476a --- /dev/null +++ b/audio_processing_bailingmm.py @@ -0,0 +1,345 @@ +from typing import List, Tuple, Dict, Optional, Any, Union +import os +import copy + +import numpy as np +import torch +import torch.utils.data +import torchaudio +import torchaudio.compliance.kaldi as kaldi +from torch.nn.utils.rnn import pad_sequence + +from transformers.utils import TensorType +from transformers.feature_extraction_utils import FeatureExtractionMixin +from image_processing_bailingmm import BatchFeature + +NORM_FACTOR_FOR_DTYPE = { + torch.int8: 2**7, + torch.int16: 2**15, + torch.int32: 2**31, + torch.int64: 2**63, + torch.float32: 1, + torch.float64: 1, +} + +# special tokens +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" +DEFAULT_VID_START_TOKEN = "" +DEFAULT_GEN_IMAGE_PATCH_TOKEN = "" +DEFAULT_GEN_IM_START_TOKEN = "" +DEFAULT_GEN_IM_END_TOKEN = "" +PLACEHOLDER_IMAGE_TOKEN_IN_TEXT = "" +DEFAULT_END_OF_CHUNK_TOKEN = "" + +DEFAULT_END_OF_AUDIO_TOKEN = "" +DEFAULT_AUDIO_PATCH_TOKEN = "" +DEFAULT_AU_START_TOKEN = "" +DEFAULT_GEN_AUDIO_PATCH_TOKEN = "" +DEFAULT_GEN_AU_START_TOKEN = "" +DEFAULT_GEN_AU_END_TOKEN = "" +PLACEHOLDER_AUDIO_TOKEN_IN_TEXT = "" +DEFAULT_FRAME_PATCH_TOKEN = "" +DEFAULT_TEXT_TOKEN = '' +DEFAULT_ASR_TOKEN = '' +DEFAULT_TTS_TOKEN = '' + + +class BailingMMAudioProcessor(FeatureExtractionMixin): + def __init__(self, wav_frontend_args: Dict[str, Any], **kwargs): + super().__init__(**kwargs) + self.sample_rate = 16000 + self.wav_frontend = WavFrontend(**wav_frontend_args) + + def 
to_dict(self) -> Dict[str, Any]: + output = copy.deepcopy(self.__dict__) + output["wav_frontend"] = output["wav_frontend"].__dict__ + output["wav_frontend"]["cmvn"] = output["wav_frontend"]["cmvn"].tolist() + output["wav_frontend"]["_non_persistent_buffers_set"] = list(output["wav_frontend"]["_non_persistent_buffers_set"]) + output["audio_processor_type"] = self.__class__.__name__ + return output + + @classmethod + def get_feature_extractor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + Auto-fill the cmvn file path. + """ + result, kwargs = super().get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + if not result["wav_frontend_args"]["cmvn_file"].startswith("/"): + # Convert to an absolute path. + if os.path.isdir(pretrained_model_name_or_path): + pretrained_model_dir = pretrained_model_name_or_path + else: + pretrained_model_dir = os.path.dirname(pretrained_model_name_or_path) + result["wav_frontend_args"]["cmvn_file"] = os.path.join( + pretrained_model_dir, result["wav_frontend_args"]["cmvn_file"] + ) + return result, kwargs + + def __call__(self, audios, **kwargs) -> BatchFeature: + """Preprocess an audio or a batch of audios.""" + return self.preprocess(audios, **kwargs) + + def _preprocess_audio(self, waveform: torch.Tensor, sample_rate: int) -> torch.Tensor: + waveform = normalize_audio_tensor(waveform, sample_rate, target_sample_rate=self.sample_rate) + audio_feat = self.wav_frontend(waveform.unsqueeze(0), [len(waveform)])[0].squeeze(0) + return audio_feat + + def _make_batched_audios(self, audio_feat_list: List[torch.Tensor]) -> Dict[str, Any]: + audio_feats_lengths = torch.tensor([[audio_feat.shape[0]] for audio_feat in audio_feat_list], dtype=torch.long) + max_length = max(audio_feat.shape[0] for audio_feat in audio_feat_list) + audio_feats = torch.stack( + [ + torch.cat( + (audio_feat, torch.zeros((max_length - audio_feat.shape[0], *audio_feat.shape[1:]), dtype=audio_feat.dtype)), + dim=0, + ) for audio_feat in audio_feat_list + ], dim=0, + ) + return {"audio_feats": audio_feats.numpy(), "audio_feats_lengths": audio_feats_lengths.numpy()} + + def preprocess( + self, + audios: Union[Tuple[torch.Tensor, int], List[Tuple[torch.Tensor, int]]], + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + if isinstance(audios, List): + audio_inputs = self._make_batched_audios([self._preprocess_audio(waveform, sr) for waveform, sr in audios]) + else: + waveform, sr = audios + audio_inputs = self._make_batched_audios([self._preprocess_audio(waveform, sr)]) + return BatchFeature(data=audio_inputs, tensor_type=return_tensors) + + +class WavFrontend(torch.nn.Module): + """Conventional frontend structure for ASR. 
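# --- Illustrative sketch (editor's note, not part of this diff) ---
# The WavFrontend defined below turns a mono 16 kHz waveform into 80-dim kaldi fbank
# frames (25 ms window, 10 ms shift), with optional LFR stacking and CMVN when
# lfr_m/lfr_n and cmvn_file are set. Assumes audio_processing_bailingmm.py from this
# diff is importable; the waveform is random noise, used only to show the shapes.
import torch
from audio_processing_bailingmm import WavFrontend

frontend = WavFrontend(cmvn_file=None, n_mels=80, lfr_m=1, lfr_n=1)
waveform = torch.randn(16000)                              # 1 s of fake 16 kHz audio in [-1, 1]
feats, feat_lens = frontend(waveform.unsqueeze(0), torch.tensor([16000]))
print(feats.shape, feat_lens)                              # roughly (1, 98, 80) and tensor([98])
# --- end editor's note ---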
+ """ + + def __init__( + self, + cmvn_file: Optional[str] = None, + fs: int = 16000, + window: str = 'hamming', + n_mels: int = 80, + frame_length: int = 25, + frame_shift: int = 10, + filter_length_min: int = -1, + filter_length_max: int = -1, + lfr_m: int = 1, + lfr_n: int = 1, + dither: float = 1.0, + snip_edges: bool = True, + upsacle_samples: bool = True, + ): + super().__init__() + self.fs = fs + self.window = window + self.n_mels = n_mels + self.frame_length = frame_length + self.frame_shift = frame_shift + self.filter_length_min = filter_length_min + self.filter_length_max = filter_length_max + self.lfr_m = lfr_m + self.lfr_n = lfr_n + self.cmvn_file = cmvn_file + self.dither = dither + self.snip_edges = snip_edges + self.upsacle_samples = upsacle_samples + self.cmvn = None if self.cmvn_file is None else load_cmvn(self.cmvn_file) + + def output_size(self) -> int: + return self.n_mels * self.lfr_m + + def forward( + self, + input: torch.Tensor, + input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + waveform_length = input_lengths[i] + waveform = input[i][:waveform_length] + if self.upsacle_samples: + waveform = waveform * (1 << 15) + waveform = waveform.unsqueeze(0) + mat = kaldi.fbank(waveform, + num_mel_bins=self.n_mels, + frame_length=self.frame_length, + frame_shift=self.frame_shift, + dither=0.0, #self.dither + energy_floor=0.0, + window_type=self.window, + sample_frequency=self.fs, + snip_edges=self.snip_edges) + + if self.lfr_m != 1 or self.lfr_n != 1: + mat = apply_lfr(mat, self.lfr_m, self.lfr_n) + if self.cmvn is not None: + mat = apply_cmvn(mat, self.cmvn) + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + if batch_size == 1: + feats_pad = feats[0][None, :, :] + else: + feats_pad = pad_sequence(feats, + batch_first=True, + padding_value=0.0) + # import ipdb;ipdb.set_trace() + return feats_pad, feats_lens + + def forward_fbank( + self, + input: torch.Tensor, + input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + waveform_length = input_lengths[i] + waveform = input[i][:waveform_length] + waveform = waveform * (1 << 15) + waveform = waveform.unsqueeze(0) + mat = kaldi.fbank(waveform, + num_mel_bins=self.n_mels, + frame_length=self.frame_length, + frame_shift=self.frame_shift, + dither=self.dither, + energy_floor=0.0, + window_type=self.window, + sample_frequency=self.fs) + + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + feats_pad = pad_sequence(feats, + batch_first=True, + padding_value=0.0) + return feats_pad, feats_lens + + def forward_lfr_cmvn( + self, + input: torch.Tensor, + input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + mat = input[i, :input_lengths[i], :] + if self.lfr_m != 1 or self.lfr_n != 1: + mat = apply_lfr(mat, self.lfr_m, self.lfr_n) + if self.cmvn is not None: + mat = apply_cmvn(mat, self.cmvn) + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + feats_pad = pad_sequence(feats, + batch_first=True, + padding_value=0.0) + return feats_pad, feats_lens + + +def load_cmvn(cmvn_file): + with open(cmvn_file, 'r', 
encoding='utf-8') as f: + lines = f.readlines() + means_list = [] + vars_list = [] + for i in range(len(lines)): + line_item = lines[i].split() + if line_item[0] == '': + line_item = lines[i + 1].split() + if line_item[0] == '': + add_shift_line = line_item[3:(len(line_item) - 1)] + means_list = list(add_shift_line) + continue + elif line_item[0] == '': + line_item = lines[i + 1].split() + if line_item[0] == '': + rescale_line = line_item[3:(len(line_item) - 1)] + vars_list = list(rescale_line) + continue + means = np.array(means_list).astype(np.float32) + vars = np.array(vars_list).astype(np.float32) + cmvn = np.array([means, vars]) + cmvn = torch.as_tensor(cmvn, dtype=torch.float32) + return cmvn + + +def apply_cmvn(inputs, cmvn): # noqa + """ + Apply CMVN with mvn data + """ + + device = inputs.device + dtype = inputs.dtype + frame, dim = inputs.shape + + means = cmvn[0:1, :dim] + vars = cmvn[1:2, :dim] + inputs += means.to(device) + inputs *= vars.to(device) + + return inputs.type(torch.float32) + + +def apply_lfr(inputs, lfr_m, lfr_n): + LFR_inputs = [] + T = inputs.shape[0] + T_lfr = int(np.ceil(T / lfr_n)) + left_padding = inputs[0].repeat((lfr_m - 1) // 2, 1) + inputs = torch.vstack((left_padding, inputs)) + T = T + (lfr_m - 1) // 2 + for i in range(T_lfr): + if lfr_m <= T - i * lfr_n: + LFR_inputs.append((inputs[i * lfr_n:i * lfr_n + lfr_m]).view(1, -1)) + else: # process last LFR frame + num_padding = lfr_m - (T - i * lfr_n) + frame = (inputs[i * lfr_n:]).view(-1) + for _ in range(num_padding): + frame = torch.hstack((frame, inputs[-1])) + LFR_inputs.append(frame) + LFR_outputs = torch.vstack(LFR_inputs) + return LFR_outputs.type(torch.float32) + + +def normalize_audio_tensor( + waveform: torch.Tensor, + sample_rate: int, + device=None, + target_sample_rate: Optional[int] = None, +): + # Ensure dtype == float32. + assert waveform.dtype in NORM_FACTOR_FOR_DTYPE, f"Unsupported waveform dtype: {waveform.dtype}" + norm_factor = NORM_FACTOR_FOR_DTYPE[waveform.dtype] + waveform = waveform.to(torch.float32) / norm_factor + + # Remove the channel dimension. + while len(waveform.shape) > 1: + waveform = waveform[0] + + # Move to device. + if device is not None: + waveform = waveform.to(device) + + # Resample. 
+ if target_sample_rate is not None and sample_rate != target_sample_rate: + resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate) + if device is not None: + resampler = resampler.to(device) + waveform = resampler(waveform.unsqueeze(0)).squeeze(0) + + return waveform + diff --git a/bailingmm_utils.py b/bailingmm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bb7eb36dda54609359b769d38cd3cf869a801054 --- /dev/null +++ b/bailingmm_utils.py @@ -0,0 +1,438 @@ +from __future__ import annotations + +import base64 +import logging +import math +import os +import sys +import time +import warnings +from functools import lru_cache +from io import BytesIO + +import random +import numpy as np + +import requests +import torch +import torchvision +from packaging import version + +from PIL import Image +import torchaudio +from torchvision import io, transforms +from torchvision.transforms import InterpolationMode +from typing import Union, Tuple, List + +logger = logging.getLogger(__name__) + +IMAGE_FACTOR = 28 +MIN_PIXELS = 4 * 28 * 28 +MAX_PIXELS = 1024 * 28 * 28 +MAX_RATIO = 200 + +VIDEO_MIN_PIXELS = 128 * 28 * 28 +VIDEO_MAX_PIXELS = 768 * 28 * 28 # 4: 3 => 32: 24 (768) | 16:9 => 32:18 (576) +VIDEO_TOTAL_PIXELS = 9216 * 28 * 28 # 9216: 24-72 frames | 7680: 10-60 frames | 6144: 8-48 frames + +FRAME_FACTOR = 2 +FPS = 2.0 +FPS_MIN_FRAMES = 4 +FPS_MAX_FRAMES = 128 + +def is_decord_available() -> bool: + import importlib.util + return importlib.util.find_spec("decord") is not None + +def round_by_factor(number: int, factor: int) -> int: + """Returns the closest integer to 'number' that is divisible by 'factor'.""" + return round(number / factor) * factor + +def ceil_by_factor(number: int, factor: int) -> int: + """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" + return math.ceil(number / factor) * factor + +def floor_by_factor(number: int, factor: int) -> int: + """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" + return math.floor(number / factor) * factor + +def is_image(image_file): + if isinstance(image_file, str) and (image_file.startswith("base64,") or image_file.lower().endswith( + ('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff'))): + return True + elif isinstance(image_file, Image.Image): + return True + else: + return False + +def is_video(video_file): + if isinstance(video_file, str) and video_file.lower().endswith( + ('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")): + return True + else: + return False + +def is_audio(audio_file): + if isinstance(audio_file, str) and audio_file.lower().endswith( + (".wav", ".mp3", ".aac", ".flac", ".alac", ".m4a", ".ogg", ".wma", ".aiff", ".amr", ".au")): + return True + else: + return False + +def smart_resize( + height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS +) -> tuple[int, int]: + """ + Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + + 3. The aspect ratio of the image is maintained as closely as possible. 
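# --- Illustrative sketch (editor's note, not part of this diff) ---
# smart_resize snaps both sides to multiples of IMAGE_FACTOR (28) while keeping the
# pixel count inside [MIN_PIXELS, MAX_PIXELS] and preserving the aspect ratio as closely
# as possible. Assumes bailingmm_utils.py from this diff is importable.
from bailingmm_utils import smart_resize, MIN_PIXELS, MAX_PIXELS

h, w = smart_resize(480, 640)          # 480x640 already fits the pixel budget
print(h, w)                            # 476 644: nearest multiples of 28
assert h % 28 == 0 and w % 28 == 0
assert MIN_PIXELS <= h * w <= MAX_PIXELS
# --- end editor's note ---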
+ """ + if max(height, width) / min(height, width) > MAX_RATIO: + raise ValueError( + f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + ) + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + return h_bar, w_bar + +def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image: + if "image" in ele: + image = ele["image"] + else: + image = ele["image_url"] + image_obj = None + if isinstance(image, Image.Image): + image_obj = image + elif image.startswith("http://") or image.startswith("https://"): + image_obj = Image.open(requests.get(image, stream=True).raw) + elif image.startswith("file://"): + image_obj = Image.open(image[7:]) + elif image.startswith("data:image"): + if "base64," in image: + _, base64_data = image.split("base64,", 1) + data = base64.b64decode(base64_data) + image_obj = Image.open(BytesIO(data)) + else: + image_obj = Image.open(image) + if image_obj is None: + raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}") + image = image_obj.convert("RGB") + ## resize + if "resized_height" in ele and "resized_width" in ele: + resized_height, resized_width = smart_resize( + ele["resized_height"], + ele["resized_width"], + factor=size_factor, + ) + else: + width, height = image.size + min_pixels = ele.get("min_pixels", MIN_PIXELS) + max_pixels = ele.get("max_pixels", MAX_PIXELS) + resized_height, resized_width = smart_resize( + height, + width, + factor=size_factor, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + image = image.resize((resized_width, resized_height)) + + return image + +def sample_frames(num_frames, total_frames, sample="random"): + if sample == "sequence": + frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + else: + intervals = np.linspace(start=0, stop=total_frames, num=num_frames + 1, dtype=int) + ranges = [] + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + if sample == "random": + try: + frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] + except: + frame_indices = np.random.permutation(total_frames)[:num_frames] + frame_indices.sort() + frame_indices = list(frame_indices) + if len(frame_indices) < num_frames: + padded_frame_indices = [frame_indices[-1]] * num_frames + padded_frame_indices[:len(frame_indices)] = frame_indices + frame_indices = padded_frame_indices + elif sample == "uniform": + frame_indices = [(x[0] + x[1]) // 2 for x in ranges] + if len(frame_indices) < num_frames: + frame_indices = [ + frame_indices[int((num_frames - 1) * i / (num_frames - 1) + 0.5)] for i in range(num_frames) + ] + else: + raise NotImplementedError + return frame_indices + +def get_frames( + ele: dict, + total_frames: int, +) -> int: + """calculate the number of frames for video used for model inputs. + Args: + ele (dict): a dict contains the configuration of video. + total_frames (int): the original total number of frames of the video. + Returns: + int: the number of frames for video used for model inputs. 
+ """ + if "nframes" in ele: + num_frames = round_by_factor(ele["nframes"], FRAME_FACTOR) + else: + min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR) + max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR) + num_frames = max(min(total_frames, max_frames), min_frames) + num_frames = floor_by_factor(num_frames, FRAME_FACTOR) + + if not (FRAME_FACTOR <= num_frames <= total_frames): + raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {num_frames}.") + return num_frames + +def _read_video_torchvision( + ele: dict, +) -> (torch.Tensor, float): + """read video using torchvision.io.read_video + Args: + ele (dict): a dict contains the configuration of video. + support keys: + - video: the path of video. support "file://", "http://", "https://" and local path. + - video_start: the start time of video. + - video_end: the end time of video. + Returns: + torch.Tensor: the video tensor with shape (T, C, H, W). + """ + video_path = ele["video"] + if version.parse(torchvision.__version__) < version.parse("0.19.0"): + if "http://" in video_path or "https://" in video_path: + warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.") + if "file://" in video_path: + video_path = video_path[7:] + + sample_method = ele.get("sample", "sequence") + pts_unit = "sec" if sample_method == "sequence" else "pts" + st = time.time() + video, audio, info = io.read_video( + video_path, + start_pts=ele.get("video_start", 0.0), + end_pts=ele.get("video_end", None), + pts_unit=pts_unit, + output_format="TCHW", + ) + total_frames, video_fps = video.size(0), info["video_fps"] + logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s") + + num_frames = get_frames(ele, total_frames) + frame_indices = sample_frames( + num_frames=num_frames, total_frames=total_frames, sample=sample_method + ) + video = video[frame_indices] + sample_fps = num_frames / max(total_frames, 1e-6) * video_fps + return video, sample_fps + +def _read_video_decord( + ele: dict, +) -> (torch.Tensor, float): + """read video using decord.VideoReader + + Args: + ele (dict): a dict contains the configuration of video. + support keys: + - video: the path of video. support "file://", "http://", "https://" and local path. + - video_start: the start time of video. + - video_end: the end time of video. + Returns: + torch.Tensor: the video tensor with shape (T, C, H, W). 
+ """ + import decord + video_path = ele["video"] + + st = time.time() + vr = decord.VideoReader(video_path) + if 'video_start' in ele or 'video_end' in ele: + raise NotImplementedError("not support start_pts and end_pts in decord for now.") + total_frames, video_fps = len(vr), vr.get_avg_fps() + logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s") + + sample_method = ele.get("sample", "uniform") + # if sample_method == "sequence": + # total_frames = int(total_frames / video_fps * 2) + num_frames = get_frames(ele, int(total_frames / video_fps * 2)) + frame_indices = sample_frames( + num_frames=num_frames, total_frames=total_frames, sample=sample_method + ) + + video = vr.get_batch(frame_indices).asnumpy() + video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format + sample_fps = num_frames / max(total_frames, 1e-6) * video_fps + return video, sample_fps + +VIDEO_READER_BACKENDS = { + "decord": _read_video_decord, + "torchvision": _read_video_torchvision, +} + +FORCE_BAILINGNATIVE_VIDEO_READER = os.getenv("FORCE_BAILINGNATIVE_VIDEO_READER", None) + +@lru_cache(maxsize=1) +def get_video_reader_backend() -> str: + if FORCE_BAILINGNATIVE_VIDEO_READER is not None: + video_reader_backend = FORCE_BAILINGNATIVE_VIDEO_READER + elif is_decord_available(): + video_reader_backend = "decord" + else: + video_reader_backend = "torchvision" + print(f"bailing-native-utils using {video_reader_backend} to read video.", file=sys.stderr) + return video_reader_backend + +def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | \ + list[ + Image.Image]: + if isinstance(ele["video"], str): + if ele["video"].startswith("file://"): + ele["video"] = ele["video"][7:] + video_reader_backend = get_video_reader_backend() + try: + video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele) + except Exception as e: + logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}") + video, sample_fps = VIDEO_READER_BACKENDS["torchvision"](ele) + + if "resized_height" in ele and "resized_width" in ele: + resized_height, resized_width = smart_resize( + ele["resized_height"], + ele["resized_width"], + factor=image_factor, + ) + else: + num_frames, _, height, width = video.shape + min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS) + total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS) + max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / num_frames * FRAME_FACTOR), int(min_pixels * 1.05)) + max_pixels_supposed = ele.get("max_pixels", max_pixels) + if max_pixels_supposed > max_pixels: + logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].") + max_pixels = min(max_pixels_supposed, max_pixels) + + resized_height, resized_width = smart_resize( + height, + width, + factor=28, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + video = transforms.functional.resize( + video, + [resized_height, resized_width], + interpolation=InterpolationMode.BICUBIC, + antialias=True, + ).float() + if return_video_sample_fps: + return video, sample_fps + return video + else: + assert isinstance(ele["video"], (list, tuple)) + process_info = ele.copy() + process_info.pop("type", None) + process_info.pop("video", None) + images = [ + fetch_image({"image": video_element, **process_info}, size_factor=image_factor) + for video_element in ele["video"] + ] + if len(images) > ele["max_frames"]: + num_frames_target = ele["max_frames"] + 
print(ele["max_frames"]) + interval = len(images) // num_frames_target # 计算抽取间隔 + images = [images[i] for i in range(0, len(images), interval)][:num_frames_target] + num_frames = ceil_by_factor(len(images), FRAME_FACTOR) + if len(images) < num_frames: + images.extend([images[-1]] * (num_frames - len(images))) + if return_video_sample_fps: + return images, process_info.pop("sample_fps", 2.0) + return images + +def fetch_audio(ele: dict[str, str | torch.Tensor], return_tensor="pt") -> Tuple[Union[torch.Tensor, np.ndarray], int]: + if "audio" in ele: + audio = ele["audio"] + else: + audio = ele["audio_url"] + + if isinstance(audio, torch.Tensor): + waveform = audio + sample_rate: int = ele.get("sample_rate", 16000) + elif audio.startswith("http://") or audio.startswith("https://"): + audio_file = BytesIO(requests.get(audio, stream=True).content) + waveform, sample_rate = torchaudio.load(audio_file) + elif audio.startswith("file://"): + waveform, sample_rate = torchaudio.load(audio[7:]) + else: + waveform, sample_rate = torchaudio.load(audio) + if return_tensor == "pt": + return waveform, sample_rate + else: + return waveform.numpy(), sample_rate + +def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]: + vision_infos = [] + if isinstance(conversations[0], dict): + conversations = [conversations] + for conversation in conversations: + for message in conversation: + if isinstance(message["content"], list): + for ele in message["content"]: + if ( + "image" in ele + or "image_url" in ele + or "video" in ele + or "audio" in ele + or ele["type"] in ("image", "image_url", "video") + ): + vision_infos.append(ele) + return vision_infos + +def process_vision_info( + conversations: list[dict] | list[list[dict]], +) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, list[ + torch.Tensor | list[np.ndarray]] | None]: + vision_infos = extract_vision_info(conversations) + ## Read images, videos or audios + image_inputs = [] + video_inputs = [] + audio_inputs = [] + for vision_info in vision_infos: + if "image" in vision_info or "image_url" in vision_info: + if isinstance(vision_info["image"], (tuple, list)): + for i in range(len(vision_info["image"])): + image_inputs.append(fetch_image({"type": "image", "image": vision_info["image"][i]})) + else: + image_inputs.append(fetch_image(vision_info)) + elif "video" in vision_info or "video_url" in vision_info: + video_inputs.append(fetch_video(vision_info)) + elif "audio" in vision_info or "audio_url" in vision_info: + if isinstance(vision_info["audio"], (tuple, list)): + audio_inputs.extend(fetch_audio(info) for info in vision_info["audio"]) + else: + audio_inputs.append(fetch_audio(vision_info)) + else: + raise ValueError("image, image_url, video, video_url, audio or audio_url should in content.") + if len(image_inputs) == 0: + image_inputs = None + if len(video_inputs) == 0: + video_inputs = None + if len(audio_inputs) == 0: + audio_inputs = None + return image_inputs, video_inputs, audio_inputs diff --git a/chat_format.py b/chat_format.py new file mode 100644 index 0000000000000000000000000000000000000000..5b6906cea73c1ad701bbee41a483c3251a3c4173 --- /dev/null +++ b/chat_format.py @@ -0,0 +1,875 @@ +'''AntGLM Chat-model data format. + +格式化 AntGLM 以及各种开源模型的符号系统: + - 确定 Chat 模型依赖的文件数据结构协议 + - 确定单轮/多轮的统一结构 + - 确定 Chat 符号系统的协议, 包括角色定义、分隔符等 + - 方便做开源模型依赖的 prompt 转换 + - 支持工具、代码、推理等支持 + +参考 FastChat Conversation 对象的设计思路. 
+Reference: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py +''' + +import copy +import dataclasses +import logging +import re +import uuid +from copy import deepcopy +from enum import IntEnum, auto +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +class PromptStyle(IntEnum): + '''Prompt styles.''' + + # 原始 antglm format 格式, 单轮指令没有结构, 多轮 `第1轮\n用户: xx\n机器人: xx\n` + ANTGLM_RAW = auto() + # Chat format 格式, 单轮多轮统一为 chat format 格式 + ANTGLM_CHAT = auto() + # 单轮指令没有结构, 只有多轮为 chat format 格式 + ANTGLM_ONLY_MULTITURN_CHAT = auto() + # OpenAI ChatML 格式, 包括千问 + CHATML = auto() + # LLAMA2 格式 + LLAMA2 = auto() + # ChatGLM 1/2 格式 + CHATGLM = auto() + # ChatGLM3 格式 + CHATGLM3 = auto() + # 百川格式 + BAICHUAN2 = auto() + + +@dataclasses.dataclass +class Chat: + '''Chat 数据符号结构, 格式化 AntGLM 以及各种开源模型的符号系统. + + Examples: + + ```python + >>> from antllm.data.chat_format import Chat + + >>> ### 从 json 数据结构创建 chat 对象, 并且 format 结构使用 AntGLM 原始结构 + >>> input_json = { + ... "messages": [ + ... {"role": "HUMAN", "content": "讲一个笑话"}, + ... {"role": "ASSISTANT", "content": "为什么猪不能上网?因为它们会被网上的“猪”骗!哈哈哈!"}, + ... {"role": "HUMAN", "content": "不好笑,换个程序员的笑话"} + ... ], + ... } + >>> chat = Chat.from_json(input_json, name='antglm_raw') + + >>> ### 根据 chat 对象创建大模型训练所需 pack 数据 + >>> pack_data = chat.prompt_pack + >>> print(pack_data) + + >>> ### 根据 chat 对象创建大模型训练所需 input, output 数据 + >>> data = chat.prompt_inout + >>> print(data) + + >>> ### 根据 chat 对象创建大模型预测用的 prompt + >>> prompt = chat.prompt_str + >>> print(prompt) + + >>> ### 从大模型训练数据 {"input": "xx", "output": "xx"} 中创建 chat 对象 + >>> data = { + ... 'input': ( + ... '第1轮\n用户: 讲一个笑话\n机器人: 为什么猪不能上网?因为它们会被网上的“猪”骗!哈哈哈!\n' + ... '第2轮\n用户: 不好笑,换个程序员的笑话\n机器人:' + ... ), + ... 'output': '' + ... } + >>> chat = Chat.from_inout(data, name='antglm_raw') + + >>> ### 从大模型 pack 训练数据创建 chat 对象列表 + >>> pack_data = { + ... 'inputs': ['第1轮\n用户: 讲一个笑话\n机器人:', '第2轮\n用户: 不好笑,换个程序员的笑话\n机器人:', '第1轮\n用户: 写首诗\n机器人:'], + ... 'outputs': [ + ... '为什么猪不能上网?因为它们会被网上的“猪”骗!哈哈哈!\n', + ... '为什么程序员总是喜欢使用黑色主题?因为他们喜欢“黑暗模式”(Dark Mode),这样他们就可以在晚上加班时更好地隐藏自己的错误!', + ... ''] + ... } + >>> chats = Chat.from_pack(pack_data, name='antglm_raw') + >>> assert len(chats) == 2 + >>> print(chats[0]) + >>> print(chats[1]) + + >>> ### 显示总交互轮数 (以用户输出多少次为轮数个数) + >>> print(chat.turns_num) + + >>> ### 根据 chat 对象创建 json 格式化输出 + >>> data_json = chat.to_json() + >>> print(data_json) + + >>> ### 增加轮次信息 + >>> content = ( + ... '为什么程序员总是喜欢使用黑色主题?' + ... '因为他们喜欢“黑暗模式”(Dark Mode),这样他们就可以在晚上加班时更好地隐藏自己的错误!' + ... 
) + >>> chat.append_message(chat.role_assistant, content) + + >>> ### 将 chat 对象转成 OpenAI ChatCompletion 接口的入参 + >>> openai_messages = chat.to_openai_api_messages() + >>> print(openai_messages) + + >>> ### 复制一个 chat 对象 + >>> chat_new = chat.copy() + ``` + ''' + + # 数据结构名称 + id: str = None + + # format 支持: antglm_raw, antglm_chat, chatglm1, chatglm2, llama2, qwen, baichuan2 + name: Optional[str] = None + + # Prompt 风格 + prompt_style: Optional[PromptStyle] = None + + # System Template 和 message + system_template: str = 'SYSTEM{}' + system_message: str = '' + + # 角色定义 + role_human: str = 'HUMAN' + role_assistant: str = 'ASSISTANT' + role_observation: str = 'OBSERVATION' + role_template: str = '{}' + + # 每轮符号定义 + turn_start: str = '' + human_end: str = '' + assistant_start: str = '' + assistant_end: str = '' + assistant_end_ids: Optional[List[int]] = None + general_role_end: str = '' + + # agent 符号定义 + tool_template = '{}' + code_template = '{}' + arithemetic_templte = '{}' + image_template = '{}' + + # All messages. Each item is (role, message). + messages: List[Tuple[str, str]] = () + + # messages 中用于 few-shot messages + offset: int = 0 + + # 其他 meta data + source: Optional[str] = None + lang: Optional[str] = None + topic: Optional[str] = None + + # 原始 json 数据 + origin_json: Optional[dict] = None + + @property + def support_names(self) -> Dict[str, str]: + '''支持的数据对象名称.''' + return { + 'antglm_raw': '原始 antglm format 格式, 单轮指令没有结构, 多轮 `第1轮\\n用户:xx\\n机器人xx\\n`', + 'antglm_chat': 'Chat format 格式, 单轮多轮统一为 chat format 格式', + 'chatglm1': 'chatglm1 format', + 'chatglm2': 'chatglm2 format', + 'llama2': 'llama2 format', + 'qwen': '千问 format', + 'baichuan2': '百川 2 format', + } + + @classmethod + def from_json( + cls, + input: dict, + name: Optional[str] = None, + prompt_style: Optional[PromptStyle] = None, + ): + '''从文件数据结构到数据对象的转换. 
+ + Params: + name: `Optional[str]`, 符号系统名称 + - format 支持: antglm_raw, antglm_chat, chatglm1, chatglm2, llama2, qwen, baichuan2 + - 如果指定了 format name, 使用该 name 符号系统, 否则使用 input 中 `name` 字段 + + prompt_style: `Optional[PromptStyle]`, 指定 prompt 风格, 默认使用和 name 一致的风格 + + input: `dict`, 文件中的 json dict 对象, 协议为: + - 既支持 `messages` 字段, 也支持 `turns` 字段 + { + "id": "xxx", + "name": "antglm", + "source": "xxx", + "lang": "xx", + "topic": "xx", + "system_template": "", + "system_message": "xx", + "messages": [ + { + "role": "HUMAN", + "content": "Hi" + }, + { + "role": "ASSISTANT", + "content": "Hello" + }, + { + "role": "OBSERVATION", + "content": "xxx" + }, + { + "role": "ASSISTANT", + "content": "xxx" + } + ], + "turns": [ + {"HUMAN": "xxx", "OBSERVATION": "xx", "ASSISTANT": "xx"} + ] + } + + Returns: + `Chat` 对象 + ''' + _id = input.get('id') + if name: + _name = name + else: + _name = input.get('name') + source = input.get('source') + lang = input.get('lang') + topic = input.get('topic') + kwargs = {} + if 'system_template' in input: + kwargs['system_template'] = input['system_template'] + if 'system_message' in input: + kwargs['system_message'] = input['system_message'] + + # 转换成 Chat 对象 + chat = cls( + id=_id, + name=_name, + prompt_style=prompt_style, + source=source, + lang=lang, + topic=topic, + origin_json=deepcopy(input), + **kwargs, + ) + if 'messages' in input: + for msg in input['messages']: + if msg['role'] == 'HUMAN': + role = chat.role_human + elif msg['role'] == 'OBSERVATION': + role = chat.role_observation + elif msg['role'] == 'ASSISTANT': + role = chat.role_assistant + else: + raise ValueError(f'不支持数据集中的 role: {msg["role"]}') + + chat.append_message(role, msg['content']) + + elif 'turns' in input: + for turn in input['turns']: + if 'HUMAN' in turn: + content = turn['HUMAN'] + chat.append_message(chat.role_human, content) + if 'OBSERVATION' in turn: + content = turn['OBSERVATION'] + chat.append_message(chat.role_observation, content) + if 'ASSISTANT' in turn: + content = turn['ASSISTANT'] + chat.append_message(chat.role_assistant, content) + + return chat + + @classmethod + def from_pack( + cls, + packs: Dict[str, List[str]], + name: str, + prompt_style: Optional[PromptStyle] = None, + ) -> list: + '''根据 pack 数据创建 Chat 对象. 
+ + Params: + packs: `dict`, pack 样本数据 + { + 'inputs': ['xx', 'xx'], + 'outputs': ['xx', 'xx'], + } + + name: `str`, 符号系统名称 + prompt_style: `Optional[PromptStyle]`, 指定 prompt 风格, 默认使用和 name 一致的风格 + ''' + chat = cls(name=name, prompt_style=prompt_style) + packs = cls._format_packs(packs) + + sys_pattern = re.compile(chat.system_template.format(r'(.*?)'), re.DOTALL) + turn_pattern = re.compile(chat.turn_start.format(r'(\d+)'), re.DOTALL) + human_pattern = re.compile(chat.role_template.format(chat.role_human).strip(), re.DOTALL) + observe_pattern = re.compile(chat.role_template.format(chat.role_observation).strip(), re.DOTALL) + assistant_pattern = re.compile(chat.role_template.format(chat.role_assistant).strip(), re.DOTALL) + + chats = [] + for input, output in zip(packs['input'], packs['output']): + # system message + sys_match = sys_pattern.search(input) + if sys_match and sys_match.group(0): + # system 指令只在首轮, 新增 chat 对象 + if len(chat.messages) > 0: + chats.append(chat) + chat = cls(name=name, prompt_style=prompt_style) + + input = input[sys_match.end() :] + chat.system_message = sys_match.group(1) + + # turn start + turn_match = turn_pattern.search(input) + if turn_match and turn_match.group(0): + # 当出现下一个轮次开始信息, 新增 chat 对象 + if name in ['antglm', 'antglm_raw', 'chatglm2']: + round_start = 1 + else: + round_start = 0 + + if all( + [ + len(turn_match.groups()) > 0, + int(turn_match.group(1)) == round_start, + len(chat.messages) > 0, + ] + ): + chats.append(chat) + chat = cls(name=name, prompt_style=prompt_style) + + input = input[turn_match.end() :] + + human_iter = human_pattern.finditer(input) + observe_iter = observe_pattern.finditer(input) + assistant_iter = assistant_pattern.finditer(input) + human_match = next(human_iter, None) + observe_match = next(observe_iter, None) + assistant_match = next(assistant_iter, None) + + if not human_match and not observe_match: + # 无 role format + chat.append_message(chat.role_human, input) + + while human_match or observe_match: + next_human_match = next(human_iter, None) + next_observe_match = next(observe_iter, None) + input = cls._append_human_observation( + chat, + input, + human_match=human_match, + next_human_match=next_human_match, + observe_match=observe_match, + next_observe_match=next_observe_match, + assistant_match=assistant_match, + ) + + human_match = next_human_match + observe_match = next_observe_match + next_human_match = next(human_iter, None) + next_observe_match = next(observe_iter, None) + + if output: + chat.append_message(chat.role_assistant, output) + + if chat.messages: + chats.append(chat) + + return chats + + @classmethod + def _append_human_observation( + cls, + chat, + input: str, + human_match: Optional[re.Match] = None, + next_human_match: Optional[re.Match] = None, + observe_match: Optional[re.Match] = None, + next_observe_match: Optional[re.Match] = None, + assistant_match: Optional[re.Match] = None, + ) -> str: + '''给 chat 对象增加 human/observation message.''' + if observe_match: + # observation 在 human 之后 + if observe_match.span()[0] > observe_match.span()[0]: + human_str = input[observe_match.span()[1] : observe_match.span()[0]] + observe_str = input[observe_match.span()[1] : assistant_match.span()[0]] + chat.append_message(chat.role_human, human_str.strip()) + input_end = observe_match.span()[1] + if observe_match.span()[0] < next_human_match.span()[0]: + chat.append_message(chat.role_observation, observe_str.strip()) + input_end = assistant_match.span()[1] + else: + # observation 在 human 之前 + human_str = 
input[observe_match.span()[1] : assistant_match.span()[0]] + observe_str = input[observe_match.span()[1] : observe_match.span()[0]] + chat.append_message(chat.role_observation, observe_str.strip()) + input_end = observe_match.span()[1] + if observe_match.span()[0] < next_observe_match.span()[0]: + chat.append_message(chat.role_human, human_str.strip()) + input_end = assistant_match.span()[1] + else: + if assistant_match: + human_str = input[human_match.span()[1] : assistant_match.span()[0]] + input_end = assistant_match.span()[1] + else: + human_str = input[human_match.span()[1] :] + input_end = len(input) + chat.append_message(chat.role_human, human_str.strip()) + + return input[input_end:] + + @classmethod + def from_inout( + cls, + sample: Dict[str, str], + name: str, + prompt_style: Optional[PromptStyle] = None, + ): + '''根据单样本创建一个 Chat 对象. + + Params: + sample: `Dict[str, str]`, input/output 数据样本 + { + "input": "xxx", + "output": "xxx", + } + + name: `str`, 符号系统名称 + prompt_style: `Optional[PromptStyle]`, 指定 prompt 风格, 默认使用和 name 一致的风格 + ''' + chat = cls(name=name, prompt_style=prompt_style) + input = sample['input'] + output = sample['output'] + + sys_pattern = re.compile(chat.system_template.format(r'(.*?)'), re.DOTALL) + turn_pattern = re.compile(chat.turn_start.format(r'(\d+)'), re.DOTALL) + human_pattern = re.compile(chat.role_template.format(chat.role_human).strip(), re.DOTALL) + observe_pattern = re.compile(chat.role_template.format(chat.role_observation).strip(), re.DOTALL) + assistant_pattern = re.compile(chat.role_template.format(chat.role_assistant).strip(), re.DOTALL) + + # 去除轮次信息 + input = turn_pattern.sub('', input) + + # system message search + sys_match = sys_pattern.search(input) + if sys_match and sys_match.group(0): + input = input[sys_match.end() :] + chat.system_message = sys_match.group(1) + + human_iter = human_pattern.finditer(input) + observe_iter = observe_pattern.finditer(input) + assistant_iter = assistant_pattern.finditer(input) + human_match = next(human_iter, None) + observe_match = next(observe_iter, None) + assistant_match = next(assistant_iter, None) + next_human_match = next(human_iter, None) + next_observe_match = next(observe_iter, None) + + while any( + [ + human_match, + observe_match, + assistant_match, + ] + ): + + # human/observation 先后顺序可能不一样, 并且有可能有多个 + # 判断 assitant 之前是否还有 human/observation + while any( + [ + human_match and human_match.span()[0] < assistant_match.span()[0], + observe_match and observe_match.span()[0] < assistant_match.span()[0], + next_human_match and next_human_match.span()[0] < assistant_match.span()[0], + next_observe_match and next_observe_match.span()[0] < assistant_match.span()[0], + ] + ): + if not input: + break + + cls._append_human_observation( + chat, + input, + human_match=human_match, + next_human_match=next_human_match, + observe_match=observe_match, + next_observe_match=next_observe_match, + assistant_match=assistant_match, + ) + + human_match = next_human_match + observe_match = next_observe_match + next_human_match = next(human_iter, None) + next_observe_match = next(observe_iter, None) + + # assistant message + if assistant_match and assistant_match.span(): + if observe_match: + if observe_match.span() and observe_match.span()[0] < human_match.span()[0]: + assistant_str = input[assistant_match.span()[1] : observe_match.span()[0]] + elif human_match: + if human_match.span(): + assistant_str = input[assistant_match.span()[1] : human_match.span()[0]] + else: + assistant_str = input[assistant_match.span()[1] 
:] + + if assistant_str: + chat.append_message(chat.role_assistant, assistant_str) + + assistant_match = next(assistant_iter, None) + + if output: + chat.append_message(chat.role_assistant, output) + + return chat + + def __hash__(self): + '''数据对象的 hash 函数.''' + return hash(self.id) + + def __post_init__(self): + '''对象初始化后的处理, 处理包括: + - 根据数据对象名称, 支持转成其他开源数据对象的基本信息 + ''' + self.id = str(uuid.uuid4()) + if not self.messages: + self.messages = [] + + if not self.name and not self.prompt_style: + logger.error('构造 Chat 对象至少包含以下一个入参: `name/prompt_style`.\n\n' '`name` 支持以下 format 名称:') + logger.error('\n'.join([f'{k}: {v}' for k, v in self.support_names.items()])) + logger.error('\n`prompt_style` 参考 antllm.data.chat_format.PromptStyle') + raise ValueError + + if self.name == 'antglm': + # 默认 antglm 使用原始 antglm_raw - 第1轮\n用户: xx\n机器人: xx\n + self.name = 'antglm_raw' + + if not self.name and self.prompt_style == PromptStyle.ANTGLM_CHAT: + logger.info( + 'Chat 对象入参没有 `name`, 默认使用 `ANTGLM_CHAT`, format:\n' + f'role_human: {self.role_human}\n' + f'role_assistant: {self.role_assistant}\n' + f'role_observation: {self.role_observation}\n' + f'role_template: {self.role_template}\n' + f'turn_start: {self.turn_start}\n' + f'human_end: {self.human_end}\n' + f'assistant_start: {self.assistant_start}\n' + f'assistant_end: {self.assistant_end}\n' + f'assistant_end_ids: {self.assistant_end_ids}\n' + f'general_role_end: {self.general_role_end}\n' + f'tool_template: {self.tool_template}\n' + f'code_template: {self.code_template}\n' + f'arithemetic_templte: {self.arithemetic_templte}\n' + f'image_template: {self.image_template}\n' + f'\n入参 `name` 支持: ``' + ) + return + + if self.name == 'antglm_raw' or self.prompt_style == PromptStyle.ANTGLM_RAW: + self.prompt_style = PromptStyle.ANTGLM_RAW + self.role_template = '{}' + self.role_human = '用户: ' + self.role_assistant = '机器人: ' + self.turn_start = '第{}轮\n' + self.general_role_end = '\n' + + if self.name in ['chatglm1', 'chatglm2'] or self.prompt_style == PromptStyle.CHATGLM: + self.prompt_style = PromptStyle.CHATGLM + self.role_template = '{}' + self.role_human = '问:' + self.role_assistant = '答:' + self.turn_start = '[Round {}]\n' + if self.name == 'chatglm1': + self.general_role_end = '\n' + else: + self.general_role_end = '\n\n' + + elif self.name == 'chatglm3' or self.prompt_style == PromptStyle.CHATGLM3: + self.prompt_style = PromptStyle.CHATGLM3 + self.system_template = '<|system|>\n {}' + self.role_human = '<|user|>\n ' + self.role_assistant = '<|assistant|>\n ' + self.role_template = '{}' + + elif self.name == 'llama2' or self.prompt_style == PromptStyle.LLAMA2: + self.prompt_style = PromptStyle.LLAMA2 + self.role_template = '{}' + self.system_template = '[INST] <>\n{}\n<>\n\n' + self.role_human = '[INST] ' + self.role_assistant = '[/INST] ' + self.human_end = ' ' + self.assistant_end = ' ' + + elif self.name == 'qwen': + self.prompt_style = PromptStyle.CHATML + self.role_template = '{}' + self.system_template = '<|im_start|>system\n{}' + if not self.system_message: + self.system_message = 'You are a helpful assistant.' 
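+                # Together, these qwen/ChatML symbols render one turn as:
+                #   <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
+                #   <|im_start|>user\n{query}<|im_end|>\n
+                #   <|im_start|>assistant\n{answer}<|im_end|>\n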
+ self.role_human = '<|im_start|>user\n' + self.role_assistant = '<|im_start|>assistant\n' + self.general_role_end = '<|im_end|>\n' + + elif self.name == 'baichuan': + self.prompt_style = PromptStyle.BAICHUAN2 + self.role_template = '{}' + self.system_template = '{}' + self.role_human = '' + self.role_assistant = '' + + if not self.system_template: + self.system_template = '{}' + + def readable_messages(self) -> str: + '''将 messages 输出为人类可读的字符串, 方便分析数据.''' + pass + + @property + def prompt_str(self) -> str: + '''将 Chat 对象转成 prompt str, 合并 human/assitant 输出为 format 字符串.''' + return f'{self.prompt_inout["input"]}{self.prompt_inout["output"]}' + + @classmethod + def _format_packs(cls, packs: Dict[str, List[str]]) -> Dict[str, List[str]]: + '''格式化 pack 样本, 输出相同 pack inputs, outputs 个数.''' + _packs = copy.deepcopy(packs) + if len(_packs['input']) - 1 == len(_packs['output']): + _packs['output'].append('') + + if len(_packs['input']) != len(_packs['output']): + print(packs) + raise ValueError( + '输入 input 和 output 数量不匹配, ' + f'input num: {len(packs["input"])}, ' + f'output num: {len(packs["output"])}' + ) + + return _packs + + @property + def prompt_inout(self) -> Dict[str, str]: + '''将 Chat 对象转成 input prompt, output prompt 字符串. + + Returns: + `Dict[str, str]`, 示例: + { + "input": "SYSTEMxxxxHUMAN你好ASSISTANT你好,有什么可以帮您?ASSISTANT", # noqa + "output": "你好,有什么可以帮您?" + } + ''' + packs = self._format_packs(self.prompt_pack) + + # 兼容逻辑 + if self.prompt_style == PromptStyle.ANTGLM_RAW: + packs['input'] = [f'{item} ' for item in packs['input']] + + prompt_input = ''.join([f'{x}{y}' for x, y in zip(packs['input'][:-1], packs['output'][:-1])]) + prompt_input += packs['input'][-1] + prompt_output = packs['output'][-1] + + # 兼容逻辑 + if self.prompt_style == PromptStyle.ANTGLM_RAW: + prompt_input = prompt_input.strip() + + return { + 'input': prompt_input, + 'output': prompt_output, + } + + @property + def prompt_pack(self) -> Dict[str, List[str]]: + '''将数据对象转成 pack input prompt, output prompt 字符串列表.: + + Returns: + `Dict[str, List[str]]`, 示例: + + { + "input": [ + "SYSTEMxxxxHUMAN你好ASSISTANT", + "HUMAN讲个笑话ASSISTANT", + "OBSERVATION{\"weather\": \"晴\"}ASSISTANT" + ], + "output": [ + "你好,有什么可以帮您?", + "笑话 1", + "今天天气 xxx" + ] + } + + ''' + inputs = [] + outputs = [] + + # 最开始 system 构造 + system_prompt = '' + if self.system_message: + system_prompt = self.system_template.format(self.system_message) + + if system_prompt: + ret = system_prompt + self.general_role_end + else: + ret = '' + + # 有些 prompt style 单轮指令没有 format + if self.prompt_style in [ + PromptStyle.ANTGLM_RAW, + PromptStyle.ANTGLM_ONLY_MULTITURN_CHAT, + ]: + if len(self.messages) <= 2: + output = '' + for role, message in self.messages: + if role == self.role_assistant: + output = message + else: + input = ret + message + return { + 'input': [input], + 'output': [output], + } + + # 多轮对话 + if self.name in ['antglm_raw', 'chatglm2']: + round_start = 1 + else: + round_start = 0 + + for i, (role, message) in enumerate(self.messages): + # 轮次信息 + if self.name in ['antglm_raw', 'chatglm1', 'chatglm2']: + if i % 2 == 0: + ret += self.turn_start.format(i // 2 + round_start) + + # 角色 + 内容 + role_end = self.general_role_end + if role == self.role_assistant and self.assistant_end: + role_end = self.assistant_end + elif self.human_end: + role_end = self.human_end + + ret += self.role_template.format(role) + message + role_end + + if role == self.role_assistant: + # output 只保留实际 assistant 内容 + if not message: + outputs.append('') + else: + outputs.append(message + 
role_end) + # input 需要连接 assistant role + inputs[-1] += ret[: -len(message + role_end)] + elif all( + [ + role == self.role_observation, + len(self.messages) > 1, + self.messages[i - 1][0] != self.role_assistant, + ] + ): + # observation 之前不是 assistant, 需要将 observation 和上一个 input 连接一起 + continue + else: + inputs.append(ret) + ret = '' + + # 最后一轮不是机器人回复, 需要拼接机器人 role, 用于模型生成 + if i == len(self.messages) - 1 and role != self.role_assistant: + inputs[-1] += self.role_template.format(self.role_assistant).strip() + + # 兼容逻辑, 去除 inputs 最后空格符号 + if self.prompt_style == PromptStyle.ANTGLM_RAW: + inputs = [item.strip() for item in inputs] + + return { + 'input': inputs, + 'output': outputs, + } + + @property + def turns_num(self) -> int: + '''和机器人的交互轮数, 以用户输出多少次为轮数个数.''' + return sum([1 if msg[0] == self.role_human else 0 for msg in self.messages]) + + def to_json(self) -> dict: + '''输出 chat json dict 格式, 包含不同角色和机器人交互的每轮信息. + + Returns + `List[dict]`, { + "id": "xx", + "messages": [ + {"role": "HUMAN", "content": "xxx"} + ] + "turns": [ + {"HUMAN": "xx", "OBSERVATION": "xx", "ASSISTANT": "xx"} + ] + } + ''' + turns = [] + messages = [] + turn = {} + for msg in self.messages: + if msg[0] == self.role_assistant: + messages.append({'role': 'ASSISTANT', 'content': msg[1]}) + turn['ASSISTANT'] = msg[1] + turns.append(turn) + turn = {} + + if msg[0] == self.role_human: + messages.append({'role': 'HUMAN', 'content': msg[1]}) + turn['HUMAN'] = msg[1] + + if msg[0] == self.role_observation: + messages.append({'role': 'OBSERVATION', 'content': msg[1]}) + turn['OBSERVATION'] = msg[1] + + if self.messages[-1][0] == self.role_human: + messages.append({'role': 'ASSISTANT', 'content': ''}) + turn['ASSISTANT'] = '' + turns.append(turn) + + result = self.origin_json or {} + result.update( + { + 'id': self.id, + 'name': self.name, + 'source': self.source, + 'lang': self.lang, + 'topic': self.topic, + 'system_template': self.system_template, + 'system_message': self.system_message, + 'turns': turns, + 'messages': messages, + } + ) + + return result + + def set_system_message(self, system_message: str): + '''Set the system message.''' + self.system_message = system_message + + def append_message(self, role: str, message: str): + '''Append a new message.''' + if not message: + message = '' + self.messages.append([role, message]) + + def to_openai_api_messages(self) -> List[dict]: + '''Convert the conversation to OpenAI chat completion format.''' + ret = [{'role': 'system', 'content': self.system_message}] + + for i, (_, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append({'role': 'user', 'content': msg}) + else: + if msg is not None: + ret.append({'role': 'assistant', 'content': msg}) + return ret + + def copy(self): + return copy.deepcopy(self) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79ed46c84d7cf4ccb6a7742601b10deeeff18d9a --- /dev/null +++ b/config.json @@ -0,0 +1,311 @@ +{ + "_name_or_path": ".", + "architectures": [ + "BailingMMNativeForConditionalGeneration" + ], + "auto_map": { + "AutoConfig": "configuration_bailingmm.BailingMMConfig" + }, + "audio_config": { + "audio_decoder_type": null, + "audio_encoder_config_sanm": { + "attention_dropout_rate": 0.1, + "attention_heads": 4, + "dropout_rate": 0.1, + "input_layer": "pe", + "input_size": 560, + "kernel_size": 11, + "linear_units": 2048, + "normalize_before": true, + "num_blocks": 50, + "output_size": 512, + "pos_enc_class": "SinusoidalPositionEncoder", + 
"positional_dropout_rate": 0.1, + "sanm_shfit": 0, + "selfattention_layer_type": "sanm" + }, + "audio_encoder_output_size": 512, + "audio_id_shift": null, + "audio_wav_frontend_config_sanm": {}, + "ds_conv_type": "conv", + "ds_kernel_size": 1, + "ds_stride": 1, + "model_type": "bailingmm", + "norm_query_embeds": true, + "use_audio_bpe_token": false, + "vocab_size_audio": 0 + }, + "auto_map": { + "AutoConfig": "configuration_bailingmm.BailingMMConfig" + }, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "BailingMoeForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bailing_moe.BailingMoeConfig", + "AutoModel": "modeling_bailing_moe.BailingMoeModel", + "AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM", + "AutoModelForTokenClassification": "modeling_bailing_moe.BailingMoeForTokenClassification" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "embedding_dropout": 0.0, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 126081, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "first_k_dense_replace": 0, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_patch_token": 126346, + "initializer_range": 0.006, + "intermediate_size": 5632, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "min_length": 0, + "model_type": "bailing_moe", + "moe_intermediate_size": 1408, + "multi_gate": true, + "no_repeat_ngram_size": 0, + "norm_head": true, + "norm_softmax": false, + "norm_topk_prob": true, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_experts": 64, + "num_experts_per_tok": 6, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "num_return_sequences": 1, + "num_shared_experts": 2, + "output_attentions": false, + "output_dropout": 0.0, + "output_hidden_states": false, + "output_router_logits": false, + "output_scores": false, + "pad_token_id": 126081, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 600000, + "sep_token_id": null, + "sliding_window": 4096, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_bias": false, + "use_cache": true, + "use_qkv_bias": false, + "use_sliding_window": false, + "vocab_size": 126464 + }, + "mlp_depth": 2, + "model_type": "bailingmm", + "talker_config": { + "_name_or_path": "./talker", + "add_cross_attention": false, + "architectures": null, + "audio_vocab_size": 32768, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, 
+ "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "pretrained_model_path": null, + "problem_type": null, + "pruned_heads": {}, + "qa_model_hidden_size": 2048, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "s3bpe_tokenizer": null, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "text_vocab_size": 151677, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vocab_size": 184445, + "vp_feature_size": 192, + "vp_kernel_size": 1, + "vp_stride": 1 + }, + "torch_dtype": "float32", + "transformers_version": "4.45.0", + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "Qwen2_5_VisionTransformer" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2_5_vit.Qwen2_5_VLVisionConfig", + "AutoModel": "qwen2_5_vit.Qwen2_5_VisionTransformer" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "depth": 32, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fullatt_block_indexes": [ + 7, + 15, + 23, + 31 + ], + "hidden_act": "silu", + "hidden_size": 1280, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "in_channels": 3, + "in_chans": 3, + "intermediate_size": 3456, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "qwen2_5_vit", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_heads": 16, + "num_return_sequences": 1, + "out_hidden_size": 8192, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "temporal_patch_size": 2, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": 
null, + "tokens_per_second": 2, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "window_size": 112 + } +} diff --git a/configuration_audio.py b/configuration_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf5ca8d192ffede59e4aa6987201576d9ad8edd --- /dev/null +++ b/configuration_audio.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Audio configuration """ + +from transformers.configuration_utils import PretrainedConfig + + +class GLMAudioConfig(PretrainedConfig): + model_type = "bailingmm" + + def __init__( + self, + audio_encoder_output_size=512, + audio_decoder_type=None, # None, "glmaudio", "glmv4audio" + audio_id_shift=None, + vocab_size_audio=0, # audio vocab starts from audio_id_shift and ends at audio_id_shift + vocab_size_audio exclusively. + use_audio_bpe_token=False, + ds_conv_type="conv", # "conv" or "dsconv" + ds_kernel_size=1, + ds_stride=1, + norm_query_embeds=True, + audio_wav_frontend_config_sanm={}, # SANMEncoder's WavFrontend related configs. + audio_encoder_config_sanm={}, # SANMEncoder related configs. + **kwargs + ): + # Audio related. + self.audio_encoder_output_size = audio_encoder_output_size + self.audio_decoder_type = audio_decoder_type + self.audio_id_shift = audio_id_shift + self.vocab_size_audio = vocab_size_audio + self.use_audio_bpe_token = use_audio_bpe_token + + # Audio feature downsampler related. + self.ds_conv_type = ds_conv_type + self.ds_kernel_size = ds_kernel_size + self.ds_stride = ds_stride + self.norm_query_embeds = norm_query_embeds + + # Third-party module configs. + self.audio_wav_frontend_config_sanm = audio_wav_frontend_config_sanm + self.audio_encoder_config_sanm = audio_encoder_config_sanm + diff --git a/configuration_bailing_moe.py b/configuration_bailing_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..dd72ba9cd514d14d0cf295c2fbf70b6833953a00 --- /dev/null +++ b/configuration_bailing_moe.py @@ -0,0 +1,81 @@ +""" Bailing MoE model configuration """ + +from transformers.configuration_utils import PretrainedConfig + + +class BailingMoeConfig(PretrainedConfig): + model_type = "bailing_moe" + + def __init__( + self, + vocab_size=30592, + hidden_size=1024, + intermediate_size=None, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=0, + hidden_act="silu", + use_qkv_bias=False, # bailing only + use_bias=True, # bailing only + rms_norm_eps=1e-05, + norm_head=False, # bailing only + tie_word_embeddings=False, # PretrainedConfig key, here change default value. 
+ embedding_dropout=0.1, + attention_dropout=0.1, + output_dropout=0.1, + initializer_range=0.02, + max_position_embeddings=16384, + rope_theta=10000.0, + use_cache=True, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + rope_scaling=None, + pad_token_id=126081, + num_experts=16, + num_shared_experts=0, + num_experts_per_tok=2, + norm_topk_prob=True, + moe_intermediate_size=None, + first_k_dense_replace=0, + head_dim=None, + output_router_logits=False, + multi_gate=False, + image_patch_token=126346, + **kwargs, + ): + self.num_hidden_layers = num_hidden_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.use_qkv_bias = use_qkv_bias + self.use_bias = use_bias + self.norm_head = norm_head + self.rms_norm_eps = rms_norm_eps + self.embedding_dropout = embedding_dropout + self.attention_dropout = attention_dropout + self.output_dropout = output_dropout + self.initializer_range = initializer_range + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.use_cache = use_cache + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + self.head_dim = head_dim or self.hidden_size // self.num_attention_heads + self.rope_scaling = rope_scaling + + # MoE configs + self.num_experts = num_experts + self.num_shared_experts = num_shared_experts + self.num_experts_per_tok = num_experts_per_tok + self.norm_topk_prob = norm_topk_prob + self.moe_intermediate_size = moe_intermediate_size + self.first_k_dense_replace = first_k_dense_replace + self.output_router_logits = output_router_logits + self.multi_gate = multi_gate + self.image_patch_token = image_patch_token + super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs) \ No newline at end of file diff --git a/configuration_bailing_talker.py b/configuration_bailing_talker.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf24e9d45a4f9461e4130ba7da91d8b986b4a8a --- /dev/null +++ b/configuration_bailing_talker.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
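A hypothetical instantiation of the BailingMoeConfig defined above, reusing the values that appear in config.json; any argument not passed keeps the defaults from __init__.

```python
from configuration_bailing_moe import BailingMoeConfig

cfg = BailingMoeConfig(
    hidden_size=2048,
    num_attention_heads=16,
    num_hidden_layers=28,
    num_experts=64,
    num_experts_per_tok=6,
    num_shared_experts=2,
    moe_intermediate_size=1408,
    head_dim=128,
)
# When head_dim is omitted it falls back to hidden_size // num_attention_heads (also 128 here).
assert cfg.head_dim == 128
```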
+""" GLMAudio model configuration """ + +from typing import Dict + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class BailingTalkerConfig(PretrainedConfig): + # model_type = "glmaudio" + # attribute_map = { + # "num_hidden_layers": "num_layers" + # } + + def __init__( + self, + pretrained_model_path=None, + qa_model_hidden_size=2048, + vocab_size=184445, + text_vocab_size=151677, + audio_vocab_size=32768, + vp_feature_size=192, + vp_kernel_size=1, + vp_stride=1, + s3bpe_tokenizer=None, + **kwargs + ): + self.pretrained_model_path = pretrained_model_path + self.qa_model_hidden_size = qa_model_hidden_size + self.vocab_size = vocab_size + self.text_vocab_size = text_vocab_size + self.audio_vocab_size = audio_vocab_size + self.vp_feature_size = vp_feature_size + self.vp_kernel_size = vp_kernel_size + self.vp_stride = vp_stride + self.s3bpe_tokenizer = s3bpe_tokenizer + super().__init__( + **kwargs + ) diff --git a/configuration_bailingmm.py b/configuration_bailingmm.py new file mode 100644 index 0000000000000000000000000000000000000000..539e3ebb64c4711264db84cf7e1d29c25ae4dc24 --- /dev/null +++ b/configuration_bailingmm.py @@ -0,0 +1,39 @@ +# coding=utf-8 +# Copyright 2024 ANT Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import PretrainedConfig +from qwen2_5_vit import Qwen2_5_VLVisionConfig +from configuration_audio import GLMAudioConfig +from configuration_bailing_moe import BailingMoeConfig +from configuration_bailing_talker import BailingTalkerConfig + +class BailingMMConfig(PretrainedConfig): + model_type = "bailingmm" + + def __init__( + self, + mlp_depth=1, + llm_config: BailingMoeConfig = None, + vision_config: Qwen2_5_VLVisionConfig = None, + audio_config: GLMAudioConfig = None, + talker_config: BailingTalkerConfig = None, + **kwargs + ): + self.audio_config = GLMAudioConfig(**audio_config) if isinstance(audio_config, dict) else audio_config + self.vision_config = Qwen2_5_VLVisionConfig(**vision_config) if isinstance(vision_config, dict) else vision_config + self.llm_config = BailingMoeConfig(**llm_config) if isinstance(llm_config, dict) else llm_config + self.mlp_depth = mlp_depth + self.talker_config = BailingTalkerConfig(**talker_config) if isinstance(talker_config, dict) else talker_config + super().__init__(**kwargs) diff --git a/configuration_glm.py b/configuration_glm.py new file mode 100644 index 0000000000000000000000000000000000000000..d05a96a197df8fa0d51c3432246cd5f83276e5da --- /dev/null +++ b/configuration_glm.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" GLM model configuration """ + +from typing import Dict + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +GLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "shunxing1234/GLM": "https://huggingface.co/shunxing1234/GLM/resolve/main/config.json", + # See all GLM models at https://huggingface.co/models?filter=glm +} + + +class GLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~GLMModel`]. + It is used to instantiate an GLM model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the GLM [shunxing1234/GLM-base-cased](https://huggingface.co/shunxing1234/GLM-base-cased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used + to control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the GLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~GLMModel`] or + [`~TFGLMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`~GLMModel`] or + [`~TFGLMModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. 
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + last_logits_l2_alpha ('float', *optional*, defaults to -1.0): + Whether use l2 norm for last output logits. + If < 0, will not compute last logits l2 norm, + elif == 0, will compute l2 norm but not plus in the loss, + while > 0, will plus this loss in the total loss. + rotary_type (`str` or `function`, *optional*, defaults to `"none"`): + The Rotary Embedding type to used in SelfAttention. + If string, `"none"`, `"1d"`, `"2d"` are supported. + unidirectional ('bool', *optional*, defaults to `False`): + Whether or not the model is train with prefix LM or causal LM. + Example: + + ```python + >>> from transformers import GLMModel, GLMConfig + + >>> # Initializing a GLM shunxing1234/GLM-base-cased style configuration + >>> configuration = GLMConfig() + + >>> # Initializing a model from the shunxing1234/GLM-base-cased style configuration + >>> model = GLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "glm" + attribute_map = {"num_hidden_layers": "num_layers"} + + def __init__( + self, + num_layers=24, + vocab_size=30592, + hidden_size=1024, + num_experts=1, + expert_capacity=None, + moe_config: Dict = {}, + num_attention_heads=16, + num_key_value_heads=0, + embedding_dropout_prob=0.1, + attention_dropout_prob=0.1, + output_dropout_prob=0.1, + max_sequence_length=512, + checkpoint_activations=False, + checkpoint_num_layers=1, + parallel_output=True, + relative_encoding=False, + block_position_encoding=True, + output_predict=False, + spell_length=None, + spell_func="lstm", + attention_scale=1.0, + initializer_range=0.02, + pool_token="cls", + max_memory_length=0, + bf16=True, + intermediate_size=None, + last_logits_l2_alpha=-1.0, + rotary_type='none', + use_rmsnorm=False, + use_atorch_rmsnorm=False, + use_swiglu=False, + rope_scaling=1.0, + use_cache=True, + focused_attention=False, + cache_in_memory=False, + attention_grouping=None, + output_hidden_states=False, + tie_word_embeddings=True, + unidirectional=False, + use_bias=True, + use_qkv_bias=False, + mlp_version='v1', + norm_softmax=False, + norm_head=False, + num_decoder_image_token=1024, + num_decoder_audio_token=512, + **kwargs, + ): + self.num_layers = num_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_experts = num_experts + self.expert_capacity = expert_capacity + self.moe_config = moe_config + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.embedding_dropout_prob = embedding_dropout_prob + self.attention_dropout_prob = attention_dropout_prob + self.output_dropout_prob = output_dropout_prob + self.max_sequence_length = max_sequence_length + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.parallel_output = parallel_output + self.relative_encoding = relative_encoding + self.block_position_encoding = block_position_encoding + self.output_predict = output_predict + self.spell_length = spell_length + self.spell_func = spell_func + self.attention_scale = attention_scale + self.initializer_range = initializer_range + self.pool_token = pool_token + self.max_memory_length = max_memory_length + self.bf16 = bf16 + self.intermediate_size = intermediate_size + self.last_logits_l2_alpha = last_logits_l2_alpha + 
self.rotary_type = rotary_type + self.use_rmsnorm = use_rmsnorm + self.use_atorch_rmsnorm = use_atorch_rmsnorm + self.use_swiglu = use_swiglu + self.rope_scaling = rope_scaling + self.use_cache = use_cache + self.focused_attention = focused_attention + self.cache_in_memory = cache_in_memory + self.attention_grouping = attention_grouping + self.unidirectional = unidirectional + self.use_bias = use_bias + self.use_qkv_bias = use_qkv_bias + self.mlp_version = mlp_version + self.norm_softmax = norm_softmax + self.norm_head = norm_head + self.num_decoder_image_token = num_decoder_image_token + self.num_decoder_audio_token = num_decoder_audio_token + + super().__init__(output_hidden_states=output_hidden_states, tie_word_embeddings=tie_word_embeddings, **kwargs) + diff --git a/data/matcha_tts-0.0.5.1-cp38-cp38-linux_x86_64.whl b/data/matcha_tts-0.0.5.1-cp38-cp38-linux_x86_64.whl new file mode 100644 index 0000000000000000000000000000000000000000..2cc860799fb95ec614d84c66ab1726a9b9ed7f02 --- /dev/null +++ b/data/matcha_tts-0.0.5.1-cp38-cp38-linux_x86_64.whl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef7da6f0b1ee1949a48665789a1da0cf41c2b32a0bf605cdd76983934530c5e +size 646305 diff --git a/data/spks/luna.pt b/data/spks/luna.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09b1a626cf28cd47327fe62ef7b2e64ef6bab82 --- /dev/null +++ b/data/spks/luna.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f485f3180000a68aac70482d3e543429abe24866b79d8431630953c313f634c8 +size 1953 diff --git a/data/wavs/BAC009S0915W0292.wav b/data/wavs/BAC009S0915W0292.wav new file mode 100644 index 0000000000000000000000000000000000000000..52a19699830ff72144b35ab522968f66f97aa489 --- /dev/null +++ b/data/wavs/BAC009S0915W0292.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c83ec62c2cc1705fd8119cf408bdcb084df8ceffccbe128cc5b8d3cfde22f21 +size 109646 diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a7cf185daaeed2e9b6d5b2f89041794f1eb3c1a2 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,4 @@ +{ + "_from_model_config": true, + "transformers_version": "4.45.0" +} diff --git a/image_processing_bailingmm.py b/image_processing_bailingmm.py new file mode 100644 index 0000000000000000000000000000000000000000..8bbd8e49958f607366156fc2b49da426a6f96f8b --- /dev/null +++ b/image_processing_bailingmm.py @@ -0,0 +1,462 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
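A minimal sketch of how BailingMMConfig (configuration_bailingmm.py above) promotes plain dicts, such as the blocks in config.json, into typed sub-configs. It assumes the repo's modules (including qwen2_5_vit) are importable from the working directory.

```python
from configuration_bailingmm import BailingMMConfig

mm_cfg = BailingMMConfig(
    mlp_depth=2,
    llm_config={"hidden_size": 2048, "num_attention_heads": 16},
    talker_config={"qa_model_hidden_size": 2048},
)
print(type(mm_cfg.llm_config).__name__)     # BailingMoeConfig
print(type(mm_cfg.talker_config).__name__)  # BailingTalkerConfig
```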
+"""Image processor class for BailingMM""" + +import math +from typing import Dict, List, Optional, Union + +import numpy as np + +from transformers.image_processing_utils import BaseImageProcessor, BatchFeature +from transformers.image_transforms import ( + convert_to_rgb, + resize, + to_channel_dimension_format, +) +from transformers.image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + VideoInput, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_valid_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from transformers.utils import TensorType, is_vision_available, logging + +logger = logging.get_logger(__name__) + +if is_vision_available(): + from PIL import Image + +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. + + Args: + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. + """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched images from {images}") + +# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] + + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] + + raise ValueError(f"Could not make batched video from {videos}") + +def smart_resize( + height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 +): + """Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + + 3. The aspect ratio of the image is maintained as closely as possible. 
+ + """ + if height < factor or width < factor: + raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") + elif max(height, width) / min(height, width) > 200: + raise ValueError( + f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" + ) + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = math.floor(height / beta / factor) * factor + w_bar = math.floor(width / beta / factor) * factor + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + return h_bar, w_bar + +class BailingMMImageProcessor(BaseImageProcessor): + r""" + Constructs a BailingMM image processor that dynamically resizes images based on the original images. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spacial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ + + model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] + + def __init__( + self, + do_resize: bool = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + min_pixels: int = 56 * 56, + max_pixels: int = 28 * 28 * 1280, + min_pixels_video: int = 128 * 28 * 28, + max_pixels_video: int = 768 * 28 * 28, + patch_size: int = 14, + temporal_patch_size: int = 2, + merge_size: int = 2, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.do_resize = do_resize + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.min_pixels = min_pixels + self.max_pixels = max_pixels + self.min_pixels_video = min_pixels_video + self.max_pixels_video = max_pixels_video + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.merge_size = merge_size + self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} + self.do_convert_rgb = do_convert_rgb + + def _preprocess( + self, + images: Union[ImageInput, VideoInput], + do_resize: bool = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + min_pixels: int = None, + max_pixels: int = None, + ): + """ + Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`. + vision_info (`List[Dict]`, *optional*): + Optional list of dictionaries containing additional information about vision inputs. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. 
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + images = make_list_of_images(images) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + height, width = get_image_size(images[0], channel_dim=input_data_format) + resized_height, resized_width = height, width + processed_images = [] + for image in images: + if do_resize: + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + image = resize( + image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format + ) + + if do_rescale: + image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + processed_images.append(image) + + patches = np.array(processed_images) + if data_format == ChannelDimension.LAST: + patches = patches.transpose(0, 3, 1, 2) + if patches.shape[0] == 1: + patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) + channel = patches.shape[1] + grid_t = patches.shape[0] // self.temporal_patch_size + grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size + patches = patches.reshape( + grid_t, + self.temporal_patch_size, + channel, + grid_h // self.merge_size, + self.merge_size, + self.patch_size, + grid_w // self.merge_size, + self.merge_size, + self.patch_size, + ) + patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8) + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size + ) + + return flatten_patches, (grid_t, grid_h, grid_w) +
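As a shape sanity check for the `_preprocess` method above, the sketch below replays only the patch bookkeeping (temporal tiling, `merge_size` grouping, flattening) in plain NumPy, using the class defaults `patch_size=14`, `temporal_patch_size=2`, `merge_size=2` and a frame that `smart_resize` would map from 480x640 to 476x644 with `factor=28`. It is an illustration of the reshape logic, not a replacement for the method.

```python
import numpy as np

patch_size, temporal_patch_size, merge_size = 14, 2, 2          # class defaults above
frame = np.zeros((3, 476, 644), dtype=np.float32)               # smart_resize(480, 640, factor=28) -> (476, 644)

patches = np.tile(frame[None], (temporal_patch_size, 1, 1, 1))  # a single image is repeated along time
grid_t = patches.shape[0] // temporal_patch_size                # 1
grid_h, grid_w = 476 // patch_size, 644 // patch_size           # 34, 46

patches = patches.reshape(
    grid_t, temporal_patch_size, 3,
    grid_h // merge_size, merge_size, patch_size,
    grid_w // merge_size, merge_size, patch_size,
).transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)

flatten_patches = patches.reshape(
    grid_t * grid_h * grid_w, 3 * temporal_patch_size * patch_size * patch_size
)
print(flatten_patches.shape, (grid_t, grid_h, grid_w))          # (1564, 1176) (1, 34, 46)
```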
+ def preprocess( + self, + images: ImageInput, + videos: VideoInput = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + videos (`VideoInput`): + Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If + passing in videos with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Pixel-budget constraints applied when resizing, given as a dict with `"min_pixels"` and + `"max_pixels"` keys (the same format as `self.size`). + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if images is not None: + images = make_batched_images(images) + if videos is not None: + videos = make_batched_videos(videos) + + if images is not None and not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if images is not None: + pixel_values, vision_grid_thws = [], [] + for image in images: + patches, image_grid_thw = self._preprocess( + image, + do_resize=do_resize, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + min_pixels=self.min_pixels, + max_pixels=self.max_pixels + ) + pixel_values.extend(patches) + vision_grid_thws.append(image_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws} + + if videos is not None: + pixel_values, vision_grid_thws = [], [] + for images in videos: + patches, video_grid_thw = self._preprocess( + images, + do_resize=do_resize, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + min_pixels=self.min_pixels_video, + max_pixels=self.max_pixels_video + ) + pixel_values.extend(patches) + vision_grid_thws.append(video_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws} + + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/model-00001-of-00015.safetensors b/model-00001-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6db1473ae726691d3220f93a0ec9eb8b95a6b025 --- /dev/null +++ b/model-00001-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf878ba731271e7ec124eb0c6b8f1f2567da16693d6575fcf4bb418f2b247c22 +size 4989626072 diff --git a/model-00002-of-00015.safetensors b/model-00002-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbfa16a80f7e88b97cab8b2e13ea90d49eaf7f66 --- /dev/null +++ b/model-00002-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22fc8a02e35c1385869516ec5b20794e1c19c365f7375728f57988ac1d992df5 +size 4989213040 diff --git 
a/model-00003-of-00015.safetensors b/model-00003-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b7ff09acb6746805798b6e9fb07899dc6397b63b --- /dev/null +++ b/model-00003-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d428e8e4d80c20c4dd1ac42f77734159789cc2bd197c2bc747d67373702adf91 +size 4989213040 diff --git a/model-00004-of-00015.safetensors b/model-00004-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2fde4013741cfc5a3bb830e6600429571c3ab6fd --- /dev/null +++ b/model-00004-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76d02a9ceb2ea1d81221161ea192eb9725799c822dd7d506eab725c1e9d02599 +size 4989213040 diff --git a/model-00005-of-00015.safetensors b/model-00005-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b1165c185108c0ced71c79d0dc04932b73e6a235 --- /dev/null +++ b/model-00005-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19aa7738c51ec1e2bed0555d31ca2e068c6240cc7fe543d8828834549208542b +size 4989213040 diff --git a/model-00006-of-00015.safetensors b/model-00006-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e5d6697971c7f697d71b972a2f033f17789eda00 --- /dev/null +++ b/model-00006-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b50267c4665aa4bb7f7190baa7d1e1befff23d830a3abf5c7eed6607e0d524d +size 4989213248 diff --git a/model-00007-of-00015.safetensors b/model-00007-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..994d98d6ceacb791d9544dae7024722cbaec764f --- /dev/null +++ b/model-00007-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b4430c71d44de683aaeec8c47e8ed376737b404dcd59a64a42878073664db3 +size 4998142816 diff --git a/model-00008-of-00015.safetensors b/model-00008-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e0030c353eda31bda5cadae088cccdfc602154d5 --- /dev/null +++ b/model-00008-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d06000b8e78a8dfa4fcf5d31f99a1a76fb832156fa7c4cbab462220e2d16f3de +size 4989213456 diff --git a/model-00009-of-00015.safetensors b/model-00009-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a898faf42e45ed88e805ba2cf63735358430f30 --- /dev/null +++ b/model-00009-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b4d6b484a67ec9329588ce997a2442e28710215334469cb22fdc95093516dd +size 4989213472 diff --git a/model-00010-of-00015.safetensors b/model-00010-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..441f5f1c2e378ddc7be588d62a1ab928d2b12223 --- /dev/null +++ b/model-00010-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92280d5371fa8f6c1123553e7abe0d2d359904ab4dc20fe9a82e150c19798b36 +size 4989213472 diff --git a/model-00011-of-00015.safetensors b/model-00011-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..339429eabf431495655374c3d9a5a7bc81803274 --- /dev/null +++ b/model-00011-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04b37b96b8358c8e712540fca47142a89e4cf4538c87b903522f3658e667afd3 
+size 4989213472 diff --git a/model-00012-of-00015.safetensors b/model-00012-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca38e87f778282fe66263686a2abc9e317531221 --- /dev/null +++ b/model-00012-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b3d318ed0a3778aac5066f074101f54414cce516b8768a2557358c8a5d5697 +size 4989213472 diff --git a/model-00013-of-00015.safetensors b/model-00013-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be5377e8dd50037c0500bcdd9bc2f63ad521222c --- /dev/null +++ b/model-00013-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a9564626955946f0f732890143378fba1863ed06473bed49713a9ea1c967aa +size 4989213472 diff --git a/model-00014-of-00015.safetensors b/model-00014-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d69a6aba7c199a3d2a58b3aaf92038d6e559162 --- /dev/null +++ b/model-00014-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:224aef187c4e6f61ba25f5ed9a0c8cdb0525ab0aa9d05f7814d42ca7f9d7f87f +size 4783190832 diff --git a/model-00015-of-00015.safetensors b/model-00015-of-00015.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c1a7ae01e0760073b90d16f3dfa50e6eca853c2d --- /dev/null +++ b/model-00015-of-00015.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d73e53e63f6702e10a9aae081fbe58fe20ea7091021bdf5f915b42d61401d4 +size 3902650696 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..4d92cc9bb38a1f3c459f20dd3e381a8bee3810c7 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,7011 @@ +{ + "metadata": { + "total_size": 73554074496 + }, + "weight_map": { + "audio.after_norm.bias": "model-00001-of-00015.safetensors", + "audio.after_norm.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.0.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.0.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.0.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.0.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.0.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.0.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.0.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.1.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.1.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.1.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.1.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.1.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.1.norm1.weight": 
"model-00001-of-00015.safetensors", + "audio.encoders.1.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.1.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.1.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.1.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.1.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.1.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.1.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.10.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.10.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.10.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.10.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.10.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.10.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.10.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.11.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.11.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.11.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.11.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.11.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.11.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.11.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.12.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.12.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.12.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.12.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.12.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.12.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.12.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.12.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.12.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.12.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.12.self_attn.linear_out.weight": 
"model-00001-of-00015.safetensors", + "audio.encoders.12.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.12.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.13.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.13.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.13.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.13.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.13.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.13.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.13.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.14.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.14.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.14.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.14.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.14.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.14.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.14.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.15.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.15.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.15.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.15.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.15.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.15.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.15.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.16.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.16.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + 
"audio.encoders.16.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.16.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.16.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.16.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.16.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.16.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.16.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.16.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.16.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.16.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.16.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.17.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.17.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.17.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.17.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.17.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.17.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.17.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.18.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.18.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.18.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.18.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.18.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.18.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.18.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.19.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.19.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.19.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.19.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.19.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.19.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.19.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.19.norm2.weight": 
"model-00001-of-00015.safetensors", + "audio.encoders.19.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.19.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.19.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.19.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.19.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.2.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.2.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.2.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.2.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.2.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.2.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.2.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.20.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.20.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.20.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.20.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.20.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.20.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.20.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.21.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.21.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.21.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.21.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.21.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.21.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + 
"audio.encoders.21.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.22.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.22.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.22.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.22.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.22.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.22.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.22.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.23.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.23.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.23.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.23.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.23.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.23.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.23.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.24.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.24.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.24.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.24.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.24.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.24.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.24.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.25.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.25.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.25.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.25.feed_forward.w_2.weight": 
"model-00001-of-00015.safetensors", + "audio.encoders.25.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.25.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.25.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.25.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.25.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.25.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.25.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.25.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.25.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.26.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.26.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.26.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.26.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.26.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.26.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.26.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.27.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.27.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.27.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.27.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.27.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.27.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.27.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.28.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.28.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.28.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.28.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.28.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.28.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.28.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.28.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.28.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + 
"audio.encoders.28.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.28.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.28.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.28.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.29.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.29.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.29.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.29.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.29.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.29.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.29.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.3.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.3.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.3.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.3.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.3.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.3.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.3.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.30.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.30.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.30.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.30.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.30.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.30.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.30.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", 
+ "audio.encoders.31.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.31.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.31.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.31.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.31.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.31.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.31.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.32.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.32.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.32.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.32.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.32.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.32.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.32.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.33.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.33.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.33.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.33.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.33.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.33.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.33.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.34.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.34.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.34.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.34.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.34.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.34.norm1.weight": "model-00001-of-00015.safetensors", + 
"audio.encoders.34.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.34.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.34.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.34.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.34.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.34.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.34.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.35.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.35.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.35.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.35.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.35.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.35.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.35.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.36.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.36.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.36.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.36.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.36.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.36.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.36.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.37.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.37.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.37.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.37.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.37.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.37.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.37.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.37.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.37.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.37.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.37.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + 
"audio.encoders.37.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.37.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.38.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.38.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.38.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.38.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.38.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.38.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.38.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.39.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.39.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.39.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.39.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.39.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.39.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.39.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.4.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.4.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.4.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.4.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.4.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.4.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.4.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.40.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + 
"audio.encoders.40.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.40.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.40.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.40.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.40.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.40.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.41.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.41.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.41.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.41.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.41.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.41.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.41.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.42.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.42.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.42.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.42.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.42.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.42.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.42.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.43.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.43.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.43.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.43.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.43.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.43.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.43.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.43.norm2.weight": "model-00001-of-00015.safetensors", + 
"audio.encoders.43.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.43.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.43.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.43.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.43.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.44.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.44.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.44.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.44.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.44.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.44.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.44.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.45.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.45.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.45.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.45.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.45.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.45.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.45.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.46.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.46.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.46.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.46.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.46.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.46.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.46.self_attn.linear_q_k_v.weight": 
"model-00001-of-00015.safetensors", + "audio.encoders.47.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.47.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.47.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.47.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.47.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.47.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.47.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.47.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.47.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.47.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.47.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.47.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.47.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.48.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.48.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.48.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.48.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.48.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.48.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.48.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.5.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.5.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.5.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.5.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.5.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.5.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.5.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.6.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.6.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.norm1.bias": 
"model-00001-of-00015.safetensors", + "audio.encoders.6.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.6.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.6.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.6.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.6.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.7.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.7.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.7.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.7.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.7.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.7.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.7.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.8.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.8.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.8.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.8.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders.8.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.8.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.8.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.9.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.9.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders.9.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders.9.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + 
"audio.encoders.9.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders.9.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders.9.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.feed_forward.w_1.bias": "model-00001-of-00015.safetensors", + "audio.encoders0.0.feed_forward.w_1.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.feed_forward.w_2.bias": "model-00001-of-00015.safetensors", + "audio.encoders0.0.feed_forward.w_2.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.norm1.bias": "model-00001-of-00015.safetensors", + "audio.encoders0.0.norm1.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.norm2.bias": "model-00001-of-00015.safetensors", + "audio.encoders0.0.norm2.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.self_attn.fsmn_block.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.self_attn.linear_out.bias": "model-00001-of-00015.safetensors", + "audio.encoders0.0.self_attn.linear_out.weight": "model-00001-of-00015.safetensors", + "audio.encoders0.0.self_attn.linear_q_k_v.bias": "model-00001-of-00015.safetensors", + "audio.encoders0.0.self_attn.linear_q_k_v.weight": "model-00001-of-00015.safetensors", + "linear_proj.0.bias": "model-00015-of-00015.safetensors", + "linear_proj.0.weight": "model-00015-of-00015.safetensors", + "linear_proj.2.bias": "model-00015-of-00015.safetensors", + "linear_proj.2.weight": "model-00015-of-00015.safetensors", + "linear_proj_audio.0.bias": "model-00015-of-00015.safetensors", + "linear_proj_audio.0.weight": "model-00015-of-00015.safetensors", + "linear_proj_audio.3.bias": "model-00015-of-00015.safetensors", + "linear_proj_audio.3.weight": "model-00015-of-00015.safetensors", + "model.lm_head.weight": "model-00015-of-00015.safetensors", + "model.model.layers.0.attention.dense.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.attention.query_key_value.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.input_layernorm.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.audio_gate.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.0.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.0.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.0.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.1.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.1.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.1.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.10.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.10.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.10.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.11.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.11.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.11.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.12.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.12.gate_proj.weight": "model-00001-of-00015.safetensors", + 
"model.model.layers.0.mlp.experts.12.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.13.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.13.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.13.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.14.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.14.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.14.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.15.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.15.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.15.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.16.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.16.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.16.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.17.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.17.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.17.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.18.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.18.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.18.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.19.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.19.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.19.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.2.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.2.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.2.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.20.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.20.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.20.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.21.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.21.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.21.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.22.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.22.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.22.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.23.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.23.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.23.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.24.down_proj.weight": "model-00002-of-00015.safetensors", + 
"model.model.layers.0.mlp.experts.24.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.24.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.25.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.25.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.25.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.26.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.26.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.26.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.27.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.27.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.27.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.28.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.28.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.28.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.29.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.29.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.29.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.3.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.3.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.3.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.30.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.30.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.30.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.31.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.31.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.31.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.32.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.32.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.32.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.33.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.33.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.33.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.34.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.34.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.34.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.35.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.35.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.35.up_proj.weight": "model-00002-of-00015.safetensors", + 
"model.model.layers.0.mlp.experts.36.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.36.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.36.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.37.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.37.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.37.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.38.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.38.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.38.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.39.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.39.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.39.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.4.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.4.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.4.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.40.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.40.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.40.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.41.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.41.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.41.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.42.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.42.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.42.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.43.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.43.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.43.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.44.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.44.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.44.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.45.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.45.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.45.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.46.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.46.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.46.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.47.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.47.gate_proj.weight": "model-00002-of-00015.safetensors", + 
"model.model.layers.0.mlp.experts.47.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.48.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.48.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.48.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.49.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.49.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.49.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.5.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.5.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.5.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.50.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.50.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.50.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.51.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.51.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.51.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.52.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.52.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.52.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.53.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.53.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.53.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.54.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.54.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.54.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.55.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.55.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.55.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.56.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.56.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.56.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.57.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.57.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.57.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.58.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.58.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.58.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.59.down_proj.weight": "model-00002-of-00015.safetensors", + 
"model.model.layers.0.mlp.experts.59.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.59.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.6.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.6.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.6.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.60.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.60.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.60.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.61.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.61.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.61.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.62.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.62.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.62.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.63.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.63.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.63.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.experts.7.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.7.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.7.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.8.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.8.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.8.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.9.down_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.9.gate_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.experts.9.up_proj.weight": "model-00001-of-00015.safetensors", + "model.model.layers.0.mlp.gate.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.image_gate.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.shared_experts.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.shared_experts.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.mlp.shared_experts.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.0.post_attention_layernorm.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.attention.dense.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.attention.query_key_value.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.input_layernorm.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.audio_gate.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.0.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.0.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.0.up_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.1.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.1.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.1.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.10.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.10.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.10.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.11.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.11.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.11.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.12.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.12.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.12.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.13.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.13.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.13.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.14.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.14.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.14.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.15.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.15.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.15.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.16.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.16.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.16.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.17.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.17.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.17.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.18.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.18.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.18.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.19.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.19.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.19.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.2.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.2.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.2.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.20.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.20.gate_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.20.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.21.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.21.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.21.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.22.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.22.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.22.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.23.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.23.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.23.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.24.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.24.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.24.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.25.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.25.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.25.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.26.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.26.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.26.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.27.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.27.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.27.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.28.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.28.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.28.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.29.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.29.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.29.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.3.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.3.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.3.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.30.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.30.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.30.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.31.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.31.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.31.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.32.down_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.32.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.32.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.33.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.33.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.33.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.34.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.34.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.34.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.35.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.35.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.35.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.36.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.36.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.36.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.37.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.37.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.37.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.38.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.38.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.38.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.39.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.39.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.39.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.4.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.4.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.4.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.40.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.40.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.40.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.41.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.41.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.41.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.42.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.42.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.42.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.43.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.43.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.43.up_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.44.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.44.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.44.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.45.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.45.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.45.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.46.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.46.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.46.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.47.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.47.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.47.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.48.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.48.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.48.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.49.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.49.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.49.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.5.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.5.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.5.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.50.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.50.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.50.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.51.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.51.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.51.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.52.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.52.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.52.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.53.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.53.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.53.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.54.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.54.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.54.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.55.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.55.gate_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.55.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.56.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.56.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.56.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.57.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.57.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.57.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.58.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.58.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.58.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.59.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.59.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.59.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.6.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.6.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.6.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.60.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.60.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.60.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.61.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.61.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.61.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.62.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.62.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.62.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.63.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.63.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.63.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.7.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.7.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.7.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.8.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.8.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.8.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.9.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.9.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.experts.9.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.gate.weight": "model-00002-of-00015.safetensors", 
+ "model.model.layers.1.mlp.image_gate.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.shared_experts.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.shared_experts.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.mlp.shared_experts.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.1.post_attention_layernorm.weight": "model-00002-of-00015.safetensors", + "model.model.layers.10.attention.dense.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.attention.query_key_value.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.input_layernorm.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.audio_gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.0.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.0.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.0.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.1.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.1.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.1.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.10.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.10.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.10.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.11.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.11.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.11.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.12.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.12.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.12.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.13.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.13.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.13.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.14.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.14.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.14.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.15.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.15.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.15.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.16.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.16.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.16.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.17.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.17.gate_proj.weight": "model-00006-of-00015.safetensors", 
+ "model.model.layers.10.mlp.experts.17.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.18.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.18.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.18.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.19.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.19.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.19.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.2.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.2.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.2.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.20.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.20.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.20.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.21.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.21.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.21.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.22.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.22.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.22.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.23.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.23.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.23.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.24.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.24.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.24.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.25.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.25.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.25.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.26.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.26.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.26.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.27.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.27.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.27.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.28.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.28.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.28.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.29.down_proj.weight": 
"model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.29.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.29.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.3.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.3.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.3.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.30.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.30.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.30.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.31.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.31.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.31.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.32.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.32.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.32.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.33.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.33.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.33.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.34.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.34.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.34.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.35.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.35.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.35.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.36.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.36.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.36.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.37.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.37.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.37.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.38.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.38.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.38.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.39.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.39.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.39.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.4.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.4.gate_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.10.mlp.experts.4.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.40.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.40.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.40.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.41.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.41.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.41.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.42.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.42.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.42.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.43.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.43.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.43.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.44.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.44.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.44.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.45.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.45.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.45.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.46.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.46.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.46.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.47.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.47.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.47.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.48.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.48.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.48.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.49.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.49.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.49.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.5.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.5.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.5.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.50.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.50.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.50.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.51.down_proj.weight": 
"model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.51.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.51.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.52.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.52.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.52.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.53.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.53.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.53.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.54.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.54.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.54.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.55.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.55.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.55.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.56.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.56.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.56.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.57.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.57.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.57.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.58.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.58.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.58.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.59.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.59.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.59.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.6.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.6.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.6.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.60.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.60.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.60.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.61.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.61.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.61.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.62.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.62.gate_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.10.mlp.experts.62.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.63.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.63.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.63.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.experts.7.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.7.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.7.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.8.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.8.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.8.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.9.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.9.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.experts.9.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.10.mlp.gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.image_gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.shared_experts.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.mlp.shared_experts.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.10.post_attention_layernorm.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.attention.dense.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.attention.query_key_value.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.input_layernorm.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.audio_gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.0.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.0.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.0.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.1.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.1.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.1.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.10.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.10.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.10.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.11.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.11.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.11.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.12.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.12.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.12.up_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.11.mlp.experts.13.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.13.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.13.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.14.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.14.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.14.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.15.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.15.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.15.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.16.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.16.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.16.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.17.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.17.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.17.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.18.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.18.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.18.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.19.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.19.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.19.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.2.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.2.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.2.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.20.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.20.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.20.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.21.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.21.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.21.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.22.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.22.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.22.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.23.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.23.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.23.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.24.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.24.gate_proj.weight": 
"model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.24.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.25.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.25.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.25.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.26.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.26.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.26.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.27.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.27.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.27.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.28.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.28.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.28.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.29.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.29.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.29.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.3.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.3.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.3.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.30.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.30.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.30.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.31.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.31.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.31.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.32.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.32.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.32.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.33.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.33.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.33.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.34.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.34.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.34.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.35.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.35.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.35.up_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.11.mlp.experts.36.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.36.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.36.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.37.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.37.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.37.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.38.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.38.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.38.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.39.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.39.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.39.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.4.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.4.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.4.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.40.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.40.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.40.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.41.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.41.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.41.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.42.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.42.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.42.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.43.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.43.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.43.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.44.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.44.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.44.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.45.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.45.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.45.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.46.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.46.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.46.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.47.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.47.gate_proj.weight": 
"model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.47.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.48.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.48.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.48.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.49.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.49.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.49.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.5.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.5.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.5.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.50.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.50.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.50.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.51.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.51.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.51.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.52.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.52.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.52.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.53.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.53.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.53.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.54.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.54.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.54.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.55.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.55.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.55.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.56.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.56.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.56.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.57.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.57.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.57.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.58.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.58.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.58.up_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.11.mlp.experts.59.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.59.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.59.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.6.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.6.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.6.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.60.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.60.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.60.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.61.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.61.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.61.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.62.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.62.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.62.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.63.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.63.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.63.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.7.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.7.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.7.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.8.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.8.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.8.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.9.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.9.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.experts.9.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.image_gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.shared_experts.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.mlp.shared_experts.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.11.post_attention_layernorm.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.attention.dense.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.attention.query_key_value.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.input_layernorm.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.audio_gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.0.down_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.12.mlp.experts.0.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.0.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.1.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.1.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.1.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.10.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.10.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.10.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.11.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.11.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.11.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.12.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.12.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.12.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.13.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.13.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.13.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.14.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.14.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.14.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.15.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.15.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.15.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.16.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.16.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.16.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.17.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.17.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.17.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.18.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.18.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.18.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.19.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.19.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.19.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.2.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.2.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.2.up_proj.weight": 
"model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.20.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.20.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.20.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.21.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.21.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.21.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.22.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.22.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.22.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.23.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.23.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.23.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.24.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.24.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.24.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.25.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.25.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.25.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.26.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.26.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.26.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.27.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.27.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.27.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.28.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.28.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.28.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.29.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.29.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.29.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.3.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.3.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.3.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.30.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.30.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.30.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.31.down_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.12.mlp.experts.31.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.31.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.32.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.32.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.32.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.33.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.33.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.33.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.34.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.34.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.34.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.35.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.35.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.35.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.36.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.36.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.36.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.37.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.37.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.37.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.38.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.38.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.38.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.39.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.39.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.39.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.4.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.4.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.4.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.40.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.40.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.40.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.41.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.41.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.41.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.42.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.42.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.42.up_proj.weight": 
"model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.43.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.43.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.43.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.44.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.44.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.44.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.45.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.45.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.45.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.46.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.46.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.46.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.47.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.47.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.47.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.48.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.48.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.48.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.49.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.49.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.49.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.5.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.5.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.5.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.50.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.50.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.50.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.51.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.51.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.51.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.52.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.52.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.52.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.53.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.53.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.53.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.54.down_proj.weight": "model-00007-of-00015.safetensors", + 
"model.model.layers.12.mlp.experts.54.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.54.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.55.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.55.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.55.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.56.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.56.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.56.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.57.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.57.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.57.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.58.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.58.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.58.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.59.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.59.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.59.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.6.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.6.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.6.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.60.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.60.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.60.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.61.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.61.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.61.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.62.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.62.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.62.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.63.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.63.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.63.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.7.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.7.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.7.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.8.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.8.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.8.up_proj.weight": 
"model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.9.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.9.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.experts.9.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.image_gate.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.shared_experts.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.mlp.shared_experts.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.12.post_attention_layernorm.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.attention.dense.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.attention.query_key_value.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.input_layernorm.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.audio_gate.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.0.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.0.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.0.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.1.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.1.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.1.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.10.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.10.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.10.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.11.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.11.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.11.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.12.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.12.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.12.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.13.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.13.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.13.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.14.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.14.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.14.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.15.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.15.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.15.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.16.down_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.16.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.16.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.17.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.17.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.17.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.18.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.18.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.18.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.19.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.19.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.19.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.2.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.2.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.2.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.20.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.20.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.20.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.21.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.21.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.21.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.22.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.22.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.22.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.23.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.23.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.23.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.24.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.24.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.24.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.25.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.25.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.25.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.26.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.26.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.26.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.27.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.27.gate_proj.weight": "model-00008-of-00015.safetensors", + 
"model.model.layers.13.mlp.experts.27.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.28.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.28.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.28.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.29.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.29.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.29.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.3.down_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.3.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.3.up_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.30.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.30.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.30.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.31.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.31.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.31.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.32.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.32.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.32.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.33.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.33.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.33.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.34.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.34.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.34.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.35.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.35.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.35.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.36.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.36.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.36.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.37.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.37.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.37.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.38.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.38.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.38.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.39.down_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.39.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.39.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.4.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.4.gate_proj.weight": "model-00007-of-00015.safetensors", + "model.model.layers.13.mlp.experts.4.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.40.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.40.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.40.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.41.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.41.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.41.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.42.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.42.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.42.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.43.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.43.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.43.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.44.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.44.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.44.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.45.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.45.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.45.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.46.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.46.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.46.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.47.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.47.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.47.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.48.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.48.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.48.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.49.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.49.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.49.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.5.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.5.gate_proj.weight": "model-00008-of-00015.safetensors", + 
"model.model.layers.13.mlp.experts.5.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.50.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.50.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.50.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.51.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.51.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.51.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.52.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.52.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.52.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.53.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.53.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.53.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.54.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.54.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.54.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.55.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.55.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.55.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.56.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.56.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.56.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.57.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.57.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.57.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.58.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.58.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.58.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.59.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.59.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.59.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.6.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.6.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.6.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.60.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.60.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.60.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.61.down_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.61.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.61.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.62.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.62.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.62.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.63.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.63.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.63.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.7.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.7.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.7.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.8.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.8.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.8.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.9.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.9.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.experts.9.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.gate.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.image_gate.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.shared_experts.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.mlp.shared_experts.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.13.post_attention_layernorm.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.attention.dense.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.attention.query_key_value.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.input_layernorm.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.audio_gate.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.0.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.0.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.0.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.1.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.1.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.1.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.10.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.10.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.10.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.11.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.11.gate_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.11.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.12.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.12.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.12.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.13.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.13.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.13.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.14.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.14.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.14.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.15.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.15.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.15.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.16.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.16.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.16.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.17.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.17.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.17.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.18.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.18.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.18.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.19.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.19.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.19.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.2.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.2.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.2.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.20.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.20.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.20.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.21.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.21.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.21.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.22.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.22.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.22.up_proj.weight": "model-00008-of-00015.safetensors", + 
"model.model.layers.14.mlp.experts.23.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.23.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.23.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.24.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.24.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.24.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.25.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.25.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.25.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.26.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.26.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.26.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.27.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.27.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.27.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.28.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.28.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.28.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.29.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.29.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.29.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.3.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.3.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.3.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.30.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.30.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.30.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.31.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.31.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.31.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.32.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.32.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.32.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.33.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.33.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.33.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.34.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.34.gate_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.34.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.35.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.35.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.35.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.36.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.36.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.36.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.37.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.37.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.37.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.38.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.38.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.38.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.39.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.39.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.39.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.4.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.4.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.4.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.40.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.40.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.40.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.41.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.41.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.41.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.42.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.42.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.42.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.43.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.43.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.43.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.44.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.44.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.44.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.45.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.45.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.45.up_proj.weight": "model-00008-of-00015.safetensors", + 
"model.model.layers.14.mlp.experts.46.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.46.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.46.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.47.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.47.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.47.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.48.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.48.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.48.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.49.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.49.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.49.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.5.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.5.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.5.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.50.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.50.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.50.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.51.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.51.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.51.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.52.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.52.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.52.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.53.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.53.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.53.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.54.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.54.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.54.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.55.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.55.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.55.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.56.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.56.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.56.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.57.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.57.gate_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.57.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.58.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.58.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.58.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.59.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.59.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.59.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.6.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.6.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.6.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.60.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.60.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.60.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.61.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.61.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.61.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.62.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.62.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.62.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.63.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.63.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.63.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.7.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.7.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.7.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.8.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.8.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.8.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.9.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.9.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.experts.9.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.gate.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.image_gate.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.shared_experts.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.mlp.shared_experts.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.14.post_attention_layernorm.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.attention.dense.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.15.attention.query_key_value.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.input_layernorm.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.audio_gate.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.0.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.0.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.0.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.1.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.1.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.1.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.10.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.10.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.10.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.11.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.11.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.11.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.12.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.12.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.12.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.13.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.13.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.13.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.14.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.14.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.14.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.15.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.15.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.15.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.16.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.16.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.16.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.17.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.17.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.17.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.18.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.18.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.18.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.19.down_proj.weight": "model-00009-of-00015.safetensors", + 
"model.model.layers.15.mlp.experts.19.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.19.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.2.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.2.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.2.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.20.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.20.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.20.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.21.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.21.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.21.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.22.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.22.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.22.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.23.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.23.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.23.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.24.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.24.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.24.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.25.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.25.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.25.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.26.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.26.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.26.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.27.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.27.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.27.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.28.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.28.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.28.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.29.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.29.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.29.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.3.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.3.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.3.up_proj.weight": 
"model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.30.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.30.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.30.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.31.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.31.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.31.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.32.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.32.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.32.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.33.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.33.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.33.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.34.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.34.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.34.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.35.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.35.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.35.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.36.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.36.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.36.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.37.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.37.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.37.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.38.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.38.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.38.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.39.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.39.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.39.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.4.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.4.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.4.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.40.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.40.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.40.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.41.down_proj.weight": "model-00009-of-00015.safetensors", + 
"model.model.layers.15.mlp.experts.41.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.41.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.42.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.42.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.42.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.43.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.43.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.43.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.44.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.44.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.44.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.45.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.45.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.45.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.46.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.46.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.46.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.47.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.47.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.47.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.48.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.48.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.48.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.49.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.49.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.49.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.5.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.5.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.5.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.50.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.50.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.50.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.51.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.51.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.51.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.52.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.52.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.52.up_proj.weight": 
"model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.53.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.53.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.53.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.54.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.54.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.54.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.55.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.55.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.55.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.56.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.56.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.56.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.57.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.57.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.57.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.58.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.58.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.58.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.59.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.59.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.59.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.6.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.6.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.6.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.60.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.60.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.60.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.61.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.61.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.61.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.62.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.62.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.62.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.63.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.63.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.63.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.experts.7.down_proj.weight": "model-00008-of-00015.safetensors", + 
"model.model.layers.15.mlp.experts.7.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.7.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.8.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.8.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.8.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.9.down_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.9.gate_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.experts.9.up_proj.weight": "model-00008-of-00015.safetensors", + "model.model.layers.15.mlp.gate.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.image_gate.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.shared_experts.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.mlp.shared_experts.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.15.post_attention_layernorm.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.attention.dense.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.attention.query_key_value.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.input_layernorm.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.audio_gate.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.0.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.0.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.0.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.1.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.1.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.1.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.10.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.10.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.10.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.11.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.11.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.11.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.12.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.12.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.12.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.13.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.13.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.13.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.14.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.14.gate_proj.weight": "model-00009-of-00015.safetensors", + 
"model.model.layers.16.mlp.experts.14.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.15.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.15.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.15.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.16.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.16.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.16.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.17.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.17.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.17.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.18.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.18.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.18.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.19.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.19.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.19.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.2.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.2.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.2.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.20.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.20.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.20.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.21.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.21.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.21.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.22.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.22.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.22.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.23.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.23.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.23.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.24.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.24.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.24.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.25.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.25.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.25.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.26.down_proj.weight": 
"model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.26.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.26.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.27.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.27.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.27.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.28.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.28.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.28.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.29.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.29.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.29.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.3.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.3.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.3.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.30.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.30.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.30.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.31.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.31.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.31.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.32.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.32.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.32.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.33.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.33.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.33.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.34.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.34.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.34.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.35.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.35.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.35.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.36.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.36.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.36.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.37.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.37.gate_proj.weight": "model-00009-of-00015.safetensors", + 
"model.model.layers.16.mlp.experts.37.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.38.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.38.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.38.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.39.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.39.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.39.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.4.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.4.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.4.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.40.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.40.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.40.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.41.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.41.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.41.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.42.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.42.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.42.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.43.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.43.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.43.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.44.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.44.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.44.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.45.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.45.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.45.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.46.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.46.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.46.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.47.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.47.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.47.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.48.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.48.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.48.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.49.down_proj.weight": 
"model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.49.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.49.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.5.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.5.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.5.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.50.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.50.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.50.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.51.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.51.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.51.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.52.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.52.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.52.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.53.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.53.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.53.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.54.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.54.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.54.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.55.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.55.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.55.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.56.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.56.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.56.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.57.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.57.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.57.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.58.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.58.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.58.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.59.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.59.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.59.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.6.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.6.gate_proj.weight": "model-00009-of-00015.safetensors", + 
"model.model.layers.16.mlp.experts.6.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.60.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.60.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.60.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.61.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.61.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.61.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.62.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.62.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.62.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.63.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.63.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.63.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.7.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.7.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.7.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.8.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.8.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.8.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.9.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.9.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.experts.9.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.gate.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.image_gate.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.shared_experts.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.mlp.shared_experts.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.16.post_attention_layernorm.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.attention.dense.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.attention.query_key_value.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.input_layernorm.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.audio_gate.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.0.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.0.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.0.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.1.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.1.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.1.up_proj.weight": "model-00009-of-00015.safetensors", + 
"model.model.layers.17.mlp.experts.10.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.10.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.10.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.11.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.11.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.11.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.12.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.12.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.12.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.13.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.13.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.13.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.14.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.14.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.14.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.15.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.15.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.15.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.16.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.16.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.16.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.17.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.17.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.17.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.18.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.18.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.18.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.19.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.19.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.19.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.2.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.2.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.2.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.20.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.20.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.20.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.21.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.21.gate_proj.weight": 
"model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.21.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.22.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.22.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.22.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.23.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.23.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.23.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.24.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.24.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.24.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.25.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.25.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.25.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.26.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.26.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.26.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.27.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.27.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.27.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.28.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.28.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.28.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.29.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.29.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.29.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.3.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.3.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.3.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.30.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.30.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.30.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.31.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.31.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.31.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.32.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.32.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.32.up_proj.weight": "model-00010-of-00015.safetensors", + 
"model.model.layers.17.mlp.experts.33.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.33.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.33.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.34.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.34.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.34.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.35.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.35.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.35.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.36.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.36.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.36.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.37.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.37.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.37.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.38.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.38.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.38.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.39.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.39.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.39.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.4.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.4.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.4.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.40.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.40.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.40.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.41.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.41.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.41.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.42.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.42.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.42.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.43.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.43.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.43.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.44.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.44.gate_proj.weight": 
"model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.44.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.45.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.45.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.45.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.46.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.46.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.46.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.47.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.47.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.47.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.48.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.48.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.48.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.49.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.49.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.49.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.5.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.5.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.5.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.50.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.50.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.50.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.51.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.51.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.51.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.52.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.52.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.52.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.53.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.53.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.53.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.54.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.54.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.54.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.55.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.55.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.55.up_proj.weight": "model-00010-of-00015.safetensors", + 
"model.model.layers.17.mlp.experts.56.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.56.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.56.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.57.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.57.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.57.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.58.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.58.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.58.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.59.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.59.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.59.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.6.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.6.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.6.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.60.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.60.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.60.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.61.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.61.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.61.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.62.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.62.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.62.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.63.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.63.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.63.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.experts.7.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.7.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.7.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.8.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.8.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.8.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.9.down_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.9.gate_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.experts.9.up_proj.weight": "model-00009-of-00015.safetensors", + "model.model.layers.17.mlp.gate.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.image_gate.weight": "model-00010-of-00015.safetensors", + 
"model.model.layers.17.mlp.shared_experts.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.mlp.shared_experts.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.17.post_attention_layernorm.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.attention.dense.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.attention.query_key_value.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.input_layernorm.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.audio_gate.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.0.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.0.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.0.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.1.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.1.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.1.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.10.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.10.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.10.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.11.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.11.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.11.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.12.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.12.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.12.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.13.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.13.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.13.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.14.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.14.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.14.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.15.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.15.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.15.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.16.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.16.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.16.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.17.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.17.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.17.up_proj.weight": 
"model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.18.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.18.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.18.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.19.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.19.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.19.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.2.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.2.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.2.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.20.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.20.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.20.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.21.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.21.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.21.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.22.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.22.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.22.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.23.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.23.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.23.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.24.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.24.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.24.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.25.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.25.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.25.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.26.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.26.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.26.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.27.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.27.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.27.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.28.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.28.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.28.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.29.down_proj.weight": "model-00010-of-00015.safetensors", + 
"model.model.layers.18.mlp.experts.29.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.29.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.3.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.3.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.3.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.30.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.30.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.30.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.31.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.31.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.31.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.32.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.32.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.32.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.33.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.33.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.33.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.34.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.34.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.34.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.35.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.35.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.35.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.36.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.36.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.36.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.37.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.37.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.37.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.38.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.38.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.38.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.39.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.39.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.39.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.4.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.4.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.4.up_proj.weight": 
"model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.40.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.40.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.40.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.41.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.41.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.41.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.42.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.42.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.42.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.43.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.43.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.43.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.44.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.44.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.44.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.45.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.45.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.45.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.46.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.46.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.46.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.47.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.47.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.47.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.48.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.48.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.48.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.49.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.49.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.49.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.5.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.5.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.5.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.50.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.50.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.50.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.51.down_proj.weight": "model-00010-of-00015.safetensors", + 
"model.model.layers.18.mlp.experts.51.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.51.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.52.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.52.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.52.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.53.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.53.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.53.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.54.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.54.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.54.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.55.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.55.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.55.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.56.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.56.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.56.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.57.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.57.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.57.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.58.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.58.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.58.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.59.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.59.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.59.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.6.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.6.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.6.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.60.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.60.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.60.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.61.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.61.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.61.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.62.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.62.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.62.up_proj.weight": 
"model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.63.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.63.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.63.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.7.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.7.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.7.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.8.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.8.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.8.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.9.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.9.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.experts.9.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.gate.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.image_gate.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.shared_experts.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.mlp.shared_experts.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.18.post_attention_layernorm.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.attention.dense.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.attention.query_key_value.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.input_layernorm.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.audio_gate.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.0.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.0.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.0.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.1.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.1.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.1.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.10.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.10.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.10.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.11.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.11.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.11.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.12.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.12.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.12.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.13.down_proj.weight": 
"model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.13.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.13.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.14.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.14.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.14.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.15.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.15.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.15.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.16.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.16.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.16.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.17.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.17.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.17.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.18.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.18.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.18.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.19.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.19.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.19.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.2.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.2.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.2.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.20.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.20.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.20.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.21.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.21.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.21.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.22.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.22.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.22.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.23.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.23.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.23.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.24.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.24.gate_proj.weight": "model-00010-of-00015.safetensors", + 
"model.model.layers.19.mlp.experts.24.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.25.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.25.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.25.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.26.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.26.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.26.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.27.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.27.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.27.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.28.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.28.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.28.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.29.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.29.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.29.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.3.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.3.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.3.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.30.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.30.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.30.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.31.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.31.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.31.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.32.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.32.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.32.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.33.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.33.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.33.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.34.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.34.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.34.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.35.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.35.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.35.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.36.down_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.36.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.36.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.37.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.37.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.37.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.38.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.38.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.38.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.39.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.39.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.39.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.4.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.4.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.4.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.40.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.40.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.40.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.41.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.41.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.41.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.42.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.42.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.42.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.43.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.43.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.43.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.44.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.44.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.44.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.45.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.45.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.45.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.46.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.46.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.46.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.47.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.47.gate_proj.weight": "model-00011-of-00015.safetensors", + 
"model.model.layers.19.mlp.experts.47.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.48.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.48.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.48.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.49.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.49.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.49.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.5.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.5.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.5.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.50.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.50.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.50.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.51.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.51.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.51.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.52.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.52.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.52.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.53.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.53.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.53.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.54.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.54.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.54.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.55.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.55.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.55.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.56.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.56.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.56.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.57.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.57.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.57.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.58.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.58.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.58.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.59.down_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.59.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.59.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.6.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.6.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.6.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.60.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.60.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.60.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.61.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.61.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.61.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.62.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.62.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.62.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.63.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.63.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.63.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.experts.7.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.7.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.7.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.8.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.8.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.8.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.9.down_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.9.gate_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.experts.9.up_proj.weight": "model-00010-of-00015.safetensors", + "model.model.layers.19.mlp.gate.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.image_gate.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.shared_experts.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.mlp.shared_experts.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.19.post_attention_layernorm.weight": "model-00011-of-00015.safetensors", + "model.model.layers.2.attention.dense.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.attention.query_key_value.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.input_layernorm.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.audio_gate.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.0.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.0.gate_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.0.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.1.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.1.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.1.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.10.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.10.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.10.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.11.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.11.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.11.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.12.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.12.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.12.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.13.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.13.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.13.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.14.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.14.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.14.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.15.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.15.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.15.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.16.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.16.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.16.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.17.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.17.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.17.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.18.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.18.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.18.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.19.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.19.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.19.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.2.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.2.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.2.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.20.down_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.20.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.20.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.21.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.21.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.21.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.22.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.22.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.22.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.23.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.23.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.23.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.24.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.24.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.24.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.25.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.25.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.25.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.26.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.26.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.26.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.27.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.27.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.27.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.28.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.28.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.28.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.29.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.29.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.29.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.3.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.3.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.3.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.30.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.30.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.30.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.31.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.31.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.31.up_proj.weight": 
"model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.32.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.32.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.32.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.33.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.33.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.33.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.34.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.34.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.34.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.35.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.35.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.35.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.36.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.36.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.36.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.37.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.37.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.37.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.38.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.38.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.38.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.39.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.39.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.39.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.4.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.4.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.4.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.40.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.40.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.40.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.41.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.41.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.41.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.42.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.42.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.42.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.43.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.43.gate_proj.weight": 
"model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.43.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.44.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.44.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.44.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.45.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.45.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.45.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.46.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.46.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.46.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.47.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.47.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.47.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.48.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.48.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.48.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.49.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.49.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.49.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.5.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.5.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.5.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.50.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.50.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.50.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.51.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.51.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.51.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.52.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.52.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.52.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.53.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.53.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.53.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.54.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.54.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.54.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.55.down_proj.weight": 
"model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.55.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.55.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.56.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.56.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.56.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.57.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.57.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.57.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.58.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.58.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.58.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.59.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.59.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.59.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.6.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.6.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.6.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.60.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.60.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.60.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.61.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.61.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.61.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.62.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.62.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.62.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.63.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.63.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.63.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.experts.7.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.7.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.7.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.8.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.8.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.8.up_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.9.down_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.9.gate_proj.weight": "model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.experts.9.up_proj.weight": 
"model-00002-of-00015.safetensors", + "model.model.layers.2.mlp.gate.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.image_gate.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.shared_experts.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.shared_experts.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.mlp.shared_experts.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.2.post_attention_layernorm.weight": "model-00003-of-00015.safetensors", + "model.model.layers.20.attention.dense.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.attention.query_key_value.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.input_layernorm.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.audio_gate.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.0.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.0.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.0.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.1.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.1.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.1.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.10.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.10.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.10.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.11.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.11.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.11.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.12.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.12.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.12.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.13.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.13.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.13.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.14.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.14.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.14.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.15.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.15.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.15.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.16.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.16.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.16.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.17.down_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.17.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.17.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.18.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.18.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.18.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.19.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.19.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.19.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.2.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.2.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.2.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.20.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.20.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.20.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.21.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.21.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.21.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.22.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.22.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.22.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.23.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.23.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.23.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.24.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.24.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.24.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.25.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.25.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.25.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.26.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.26.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.26.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.27.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.27.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.27.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.28.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.28.gate_proj.weight": "model-00011-of-00015.safetensors", + 
"model.model.layers.20.mlp.experts.28.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.29.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.29.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.29.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.3.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.3.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.3.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.30.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.30.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.30.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.31.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.31.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.31.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.32.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.32.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.32.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.33.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.33.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.33.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.34.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.34.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.34.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.35.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.35.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.35.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.36.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.36.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.36.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.37.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.37.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.37.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.38.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.38.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.38.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.39.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.39.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.39.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.4.down_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.4.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.4.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.40.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.40.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.40.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.41.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.41.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.41.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.42.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.42.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.42.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.43.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.43.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.43.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.44.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.44.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.44.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.45.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.45.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.45.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.46.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.46.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.46.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.47.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.47.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.47.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.48.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.48.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.48.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.49.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.49.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.49.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.5.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.5.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.5.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.50.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.50.gate_proj.weight": "model-00011-of-00015.safetensors", + 
"model.model.layers.20.mlp.experts.50.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.51.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.51.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.51.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.52.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.52.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.52.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.53.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.53.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.53.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.54.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.54.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.54.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.55.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.55.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.55.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.56.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.56.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.56.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.57.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.57.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.57.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.58.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.58.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.58.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.59.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.59.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.59.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.6.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.6.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.6.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.60.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.60.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.60.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.61.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.61.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.61.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.62.down_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.62.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.62.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.63.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.63.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.63.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.7.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.7.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.7.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.8.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.8.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.8.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.9.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.9.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.experts.9.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.gate.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.image_gate.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.shared_experts.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.mlp.shared_experts.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.20.post_attention_layernorm.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.attention.dense.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.attention.query_key_value.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.input_layernorm.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.audio_gate.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.0.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.0.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.0.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.1.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.1.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.1.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.10.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.10.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.10.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.11.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.11.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.11.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.12.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.12.gate_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.12.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.13.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.13.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.13.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.14.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.14.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.14.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.15.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.15.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.15.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.16.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.16.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.16.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.17.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.17.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.17.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.18.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.18.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.18.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.19.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.19.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.19.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.2.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.2.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.2.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.20.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.20.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.20.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.21.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.21.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.21.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.22.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.22.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.22.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.23.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.23.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.23.up_proj.weight": "model-00011-of-00015.safetensors", + 
"model.model.layers.21.mlp.experts.24.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.24.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.24.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.25.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.25.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.25.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.26.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.26.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.26.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.27.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.27.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.27.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.28.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.28.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.28.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.29.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.29.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.29.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.3.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.3.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.3.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.30.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.30.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.30.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.31.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.31.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.31.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.32.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.32.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.32.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.33.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.33.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.33.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.34.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.34.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.34.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.35.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.35.gate_proj.weight": 
"model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.35.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.36.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.36.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.36.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.37.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.37.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.37.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.38.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.38.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.38.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.39.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.39.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.39.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.4.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.4.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.4.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.40.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.40.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.40.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.41.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.41.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.41.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.42.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.42.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.42.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.43.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.43.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.43.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.44.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.44.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.44.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.45.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.45.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.45.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.46.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.46.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.46.up_proj.weight": "model-00012-of-00015.safetensors", + 
"model.model.layers.21.mlp.experts.47.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.47.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.47.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.48.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.48.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.48.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.49.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.49.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.49.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.5.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.5.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.5.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.50.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.50.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.50.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.51.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.51.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.51.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.52.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.52.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.52.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.53.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.53.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.53.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.54.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.54.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.54.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.55.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.55.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.55.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.56.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.56.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.56.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.57.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.57.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.57.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.58.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.58.gate_proj.weight": 
"model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.58.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.59.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.59.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.59.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.6.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.6.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.6.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.60.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.60.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.60.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.61.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.61.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.61.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.62.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.62.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.62.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.63.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.63.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.63.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.experts.7.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.7.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.7.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.8.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.8.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.8.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.9.down_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.9.gate_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.experts.9.up_proj.weight": "model-00011-of-00015.safetensors", + "model.model.layers.21.mlp.gate.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.image_gate.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.shared_experts.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.mlp.shared_experts.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.21.post_attention_layernorm.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.attention.dense.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.attention.query_key_value.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.input_layernorm.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.audio_gate.weight": 
"model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.0.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.0.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.0.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.1.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.1.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.1.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.10.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.10.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.10.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.11.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.11.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.11.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.12.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.12.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.12.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.13.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.13.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.13.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.14.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.14.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.14.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.15.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.15.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.15.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.16.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.16.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.16.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.17.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.17.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.17.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.18.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.18.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.18.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.19.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.19.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.19.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.2.down_proj.weight": "model-00012-of-00015.safetensors", + 
"model.model.layers.22.mlp.experts.2.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.2.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.20.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.20.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.20.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.21.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.21.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.21.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.22.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.22.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.22.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.23.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.23.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.23.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.24.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.24.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.24.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.25.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.25.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.25.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.26.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.26.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.26.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.27.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.27.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.27.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.28.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.28.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.28.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.29.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.29.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.29.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.3.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.3.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.3.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.30.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.30.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.30.up_proj.weight": 
"model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.31.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.31.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.31.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.32.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.32.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.32.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.33.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.33.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.33.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.34.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.34.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.34.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.35.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.35.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.35.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.36.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.36.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.36.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.37.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.37.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.37.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.38.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.38.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.38.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.39.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.39.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.39.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.4.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.4.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.4.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.40.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.40.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.40.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.41.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.41.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.41.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.42.down_proj.weight": "model-00012-of-00015.safetensors", + 
"model.model.layers.22.mlp.experts.42.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.42.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.43.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.43.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.43.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.44.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.44.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.44.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.45.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.45.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.45.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.46.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.46.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.46.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.47.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.47.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.47.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.48.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.48.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.48.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.49.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.49.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.49.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.5.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.5.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.5.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.50.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.50.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.50.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.51.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.51.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.51.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.52.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.52.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.52.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.53.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.53.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.53.up_proj.weight": 
"model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.54.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.54.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.54.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.55.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.55.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.55.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.56.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.56.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.56.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.57.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.57.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.57.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.58.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.58.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.58.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.59.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.59.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.59.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.6.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.6.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.6.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.60.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.60.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.60.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.61.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.61.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.61.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.62.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.62.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.62.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.63.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.63.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.63.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.7.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.7.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.7.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.8.down_proj.weight": "model-00012-of-00015.safetensors", + 
"model.model.layers.22.mlp.experts.8.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.8.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.9.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.9.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.experts.9.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.gate.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.image_gate.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.shared_experts.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.mlp.shared_experts.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.22.post_attention_layernorm.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.attention.dense.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.attention.query_key_value.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.input_layernorm.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.audio_gate.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.0.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.0.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.0.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.1.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.1.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.1.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.10.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.10.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.10.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.11.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.11.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.11.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.12.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.12.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.12.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.13.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.13.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.13.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.14.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.14.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.14.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.15.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.15.gate_proj.weight": "model-00012-of-00015.safetensors", + 
"model.model.layers.23.mlp.experts.15.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.16.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.16.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.16.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.17.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.17.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.17.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.18.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.18.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.18.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.19.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.19.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.19.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.2.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.2.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.2.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.20.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.20.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.20.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.21.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.21.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.21.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.22.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.22.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.22.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.23.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.23.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.23.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.24.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.24.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.24.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.25.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.25.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.25.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.26.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.26.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.26.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.27.down_proj.weight": 
"model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.27.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.27.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.28.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.28.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.28.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.29.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.29.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.29.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.3.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.3.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.3.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.30.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.30.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.30.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.31.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.31.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.31.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.32.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.32.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.32.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.33.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.33.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.33.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.34.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.34.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.34.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.35.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.35.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.35.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.36.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.36.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.36.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.37.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.37.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.37.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.38.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.38.gate_proj.weight": "model-00012-of-00015.safetensors", + 
"model.model.layers.23.mlp.experts.38.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.39.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.39.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.39.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.4.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.4.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.4.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.40.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.40.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.40.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.41.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.41.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.41.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.42.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.42.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.42.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.43.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.43.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.43.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.44.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.44.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.44.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.45.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.45.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.45.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.46.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.46.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.46.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.47.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.47.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.47.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.48.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.48.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.48.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.49.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.49.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.49.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.5.down_proj.weight": 
"model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.5.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.5.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.50.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.50.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.50.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.51.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.51.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.51.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.52.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.52.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.52.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.53.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.53.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.53.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.54.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.54.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.54.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.55.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.55.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.55.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.56.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.56.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.56.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.57.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.57.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.57.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.58.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.58.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.58.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.59.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.59.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.59.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.6.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.6.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.6.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.60.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.60.gate_proj.weight": "model-00013-of-00015.safetensors", + 
"model.model.layers.23.mlp.experts.60.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.61.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.61.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.61.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.62.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.62.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.62.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.63.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.63.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.63.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.experts.7.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.7.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.7.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.8.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.8.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.8.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.9.down_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.9.gate_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.experts.9.up_proj.weight": "model-00012-of-00015.safetensors", + "model.model.layers.23.mlp.gate.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.image_gate.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.shared_experts.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.mlp.shared_experts.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.23.post_attention_layernorm.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.attention.dense.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.attention.query_key_value.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.input_layernorm.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.audio_gate.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.0.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.0.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.0.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.1.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.1.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.1.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.10.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.10.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.10.up_proj.weight": "model-00013-of-00015.safetensors", + 
"model.model.layers.24.mlp.experts.11.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.11.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.11.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.12.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.12.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.12.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.13.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.13.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.13.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.14.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.14.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.14.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.15.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.15.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.15.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.16.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.16.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.16.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.17.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.17.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.17.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.18.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.18.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.18.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.19.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.19.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.19.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.2.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.2.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.2.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.20.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.20.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.20.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.21.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.21.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.21.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.22.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.22.gate_proj.weight": 
"model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.22.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.23.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.23.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.23.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.24.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.24.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.24.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.25.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.25.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.25.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.26.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.26.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.26.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.27.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.27.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.27.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.28.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.28.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.28.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.29.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.29.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.29.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.3.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.3.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.3.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.30.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.30.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.30.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.31.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.31.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.31.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.32.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.32.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.32.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.33.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.33.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.33.up_proj.weight": "model-00013-of-00015.safetensors", + 
"model.model.layers.24.mlp.experts.34.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.34.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.34.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.35.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.35.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.35.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.36.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.36.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.36.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.37.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.37.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.37.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.38.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.38.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.38.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.39.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.39.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.39.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.4.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.4.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.4.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.40.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.40.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.40.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.41.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.41.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.41.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.42.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.42.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.42.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.43.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.43.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.43.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.44.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.44.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.44.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.45.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.45.gate_proj.weight": 
"model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.45.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.46.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.46.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.46.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.47.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.47.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.47.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.48.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.48.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.48.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.49.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.49.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.49.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.5.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.5.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.5.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.50.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.50.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.50.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.51.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.51.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.51.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.52.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.52.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.52.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.53.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.53.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.53.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.54.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.54.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.54.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.55.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.55.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.55.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.56.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.56.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.56.up_proj.weight": "model-00013-of-00015.safetensors", + 
"model.model.layers.24.mlp.experts.57.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.57.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.57.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.58.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.58.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.58.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.59.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.59.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.59.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.6.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.6.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.6.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.60.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.60.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.60.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.61.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.61.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.61.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.62.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.62.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.62.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.63.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.63.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.63.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.7.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.7.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.7.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.8.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.8.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.8.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.9.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.9.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.experts.9.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.gate.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.image_gate.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.shared_experts.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.24.mlp.shared_experts.up_proj.weight": 
"model-00013-of-00015.safetensors", + "model.model.layers.24.post_attention_layernorm.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.attention.dense.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.attention.query_key_value.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.input_layernorm.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.audio_gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.experts.0.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.0.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.0.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.1.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.1.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.1.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.10.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.10.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.10.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.11.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.11.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.11.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.12.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.12.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.12.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.13.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.13.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.13.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.14.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.14.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.14.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.15.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.15.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.15.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.16.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.16.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.16.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.17.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.17.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.17.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.18.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.18.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.18.up_proj.weight": 
"model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.19.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.19.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.19.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.2.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.2.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.2.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.20.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.20.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.20.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.21.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.21.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.21.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.22.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.22.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.22.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.23.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.23.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.23.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.24.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.24.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.24.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.25.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.25.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.25.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.26.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.26.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.26.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.27.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.27.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.27.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.28.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.28.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.28.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.29.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.29.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.29.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.3.down_proj.weight": "model-00013-of-00015.safetensors", + 
"model.model.layers.25.mlp.experts.3.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.3.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.30.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.30.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.30.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.31.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.31.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.31.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.32.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.32.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.32.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.33.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.33.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.33.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.34.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.34.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.34.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.35.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.35.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.35.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.36.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.36.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.36.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.37.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.37.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.37.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.38.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.38.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.38.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.39.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.39.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.39.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.4.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.4.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.4.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.40.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.40.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.40.up_proj.weight": 
"model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.41.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.41.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.41.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.42.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.42.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.42.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.43.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.43.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.43.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.44.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.44.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.44.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.45.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.45.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.45.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.46.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.46.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.46.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.47.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.47.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.47.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.48.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.48.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.48.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.49.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.49.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.49.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.5.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.5.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.5.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.50.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.50.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.50.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.51.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.51.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.51.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.52.down_proj.weight": "model-00013-of-00015.safetensors", + 
"model.model.layers.25.mlp.experts.52.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.52.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.53.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.53.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.53.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.54.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.54.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.54.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.55.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.55.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.55.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.56.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.56.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.56.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.57.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.57.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.57.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.58.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.58.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.58.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.59.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.59.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.59.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.6.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.6.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.6.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.60.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.60.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.60.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.61.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.61.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.61.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.62.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.experts.62.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.62.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.experts.63.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.experts.63.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.experts.63.up_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.experts.7.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.7.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.7.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.8.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.8.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.8.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.9.down_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.9.gate_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.experts.9.up_proj.weight": "model-00013-of-00015.safetensors", + "model.model.layers.25.mlp.gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.image_gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.shared_experts.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.mlp.shared_experts.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.25.post_attention_layernorm.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.attention.dense.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.attention.query_key_value.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.input_layernorm.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.audio_gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.0.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.0.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.0.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.1.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.1.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.1.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.10.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.10.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.10.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.11.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.11.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.11.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.12.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.12.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.12.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.13.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.13.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.13.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.14.down_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.14.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.14.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.15.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.15.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.15.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.16.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.16.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.16.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.17.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.17.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.17.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.18.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.18.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.18.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.19.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.19.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.19.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.2.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.2.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.2.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.20.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.20.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.20.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.21.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.21.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.21.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.22.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.22.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.22.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.23.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.23.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.23.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.24.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.24.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.24.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.25.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.25.gate_proj.weight": "model-00014-of-00015.safetensors", + 
"model.model.layers.26.mlp.experts.25.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.26.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.26.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.26.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.27.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.27.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.27.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.28.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.28.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.28.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.29.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.29.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.29.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.3.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.3.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.3.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.30.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.30.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.30.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.31.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.31.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.31.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.32.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.32.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.32.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.33.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.33.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.33.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.34.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.34.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.34.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.35.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.35.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.35.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.36.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.36.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.36.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.37.down_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.37.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.37.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.38.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.38.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.38.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.39.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.39.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.39.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.4.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.4.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.4.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.40.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.40.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.40.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.41.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.41.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.41.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.42.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.42.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.42.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.43.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.43.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.43.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.44.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.44.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.44.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.45.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.45.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.45.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.46.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.46.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.46.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.47.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.47.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.47.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.48.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.48.gate_proj.weight": "model-00014-of-00015.safetensors", + 
"model.model.layers.26.mlp.experts.48.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.49.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.49.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.49.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.5.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.5.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.5.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.50.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.50.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.50.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.51.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.51.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.51.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.52.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.52.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.52.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.53.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.53.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.53.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.54.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.54.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.54.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.55.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.55.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.55.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.56.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.56.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.56.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.57.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.57.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.57.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.58.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.58.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.58.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.59.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.59.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.59.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.6.down_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.6.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.6.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.60.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.60.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.60.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.61.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.61.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.61.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.62.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.62.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.62.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.63.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.63.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.63.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.7.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.7.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.7.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.8.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.8.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.8.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.9.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.9.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.experts.9.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.image_gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.shared_experts.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.mlp.shared_experts.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.26.post_attention_layernorm.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.attention.dense.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.attention.query_key_value.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.input_layernorm.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.audio_gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.0.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.0.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.0.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.1.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.1.gate_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.1.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.10.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.10.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.10.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.11.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.11.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.11.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.12.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.12.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.12.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.13.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.13.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.13.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.14.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.14.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.14.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.15.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.15.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.15.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.16.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.16.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.16.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.17.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.17.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.17.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.18.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.18.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.18.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.19.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.19.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.19.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.2.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.2.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.2.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.20.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.20.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.20.up_proj.weight": "model-00014-of-00015.safetensors", + 
"model.model.layers.27.mlp.experts.21.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.21.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.21.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.22.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.22.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.22.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.23.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.23.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.23.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.24.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.24.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.24.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.25.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.25.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.25.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.26.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.26.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.26.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.27.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.27.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.27.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.28.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.28.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.28.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.29.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.29.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.29.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.3.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.3.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.3.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.30.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.30.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.30.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.31.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.31.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.31.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.32.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.32.gate_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.32.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.33.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.33.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.33.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.34.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.34.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.34.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.35.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.35.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.35.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.36.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.36.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.36.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.37.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.37.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.37.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.38.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.38.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.38.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.39.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.39.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.39.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.4.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.4.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.4.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.40.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.40.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.40.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.41.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.41.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.41.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.42.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.42.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.42.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.43.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.43.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.43.up_proj.weight": "model-00014-of-00015.safetensors", + 
"model.model.layers.27.mlp.experts.44.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.44.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.44.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.45.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.45.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.45.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.46.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.46.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.46.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.47.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.47.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.47.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.48.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.48.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.48.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.49.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.49.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.49.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.5.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.5.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.5.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.50.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.50.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.50.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.51.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.51.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.51.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.52.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.52.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.52.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.53.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.53.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.53.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.54.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.54.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.54.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.55.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.55.gate_proj.weight": 
"model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.55.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.56.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.56.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.56.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.57.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.57.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.57.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.58.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.58.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.58.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.59.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.59.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.59.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.6.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.6.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.6.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.60.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.60.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.60.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.61.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.61.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.61.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.62.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.62.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.62.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.63.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.63.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.63.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.7.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.7.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.7.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.8.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.8.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.8.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.9.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.9.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.experts.9.up_proj.weight": "model-00014-of-00015.safetensors", + 
"model.model.layers.27.mlp.gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.image_gate.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.shared_experts.down_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.mlp.shared_experts.up_proj.weight": "model-00014-of-00015.safetensors", + "model.model.layers.27.post_attention_layernorm.weight": "model-00014-of-00015.safetensors", + "model.model.layers.3.attention.dense.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.attention.query_key_value.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.input_layernorm.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.audio_gate.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.0.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.0.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.0.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.1.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.1.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.1.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.10.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.10.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.10.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.11.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.11.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.11.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.12.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.12.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.12.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.13.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.13.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.13.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.14.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.14.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.14.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.15.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.15.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.15.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.16.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.16.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.16.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.17.down_proj.weight": "model-00003-of-00015.safetensors", + 
"model.model.layers.3.mlp.experts.17.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.17.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.18.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.18.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.18.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.19.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.19.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.19.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.2.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.2.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.2.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.20.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.20.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.20.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.21.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.21.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.21.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.22.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.22.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.22.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.23.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.23.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.23.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.24.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.24.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.24.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.25.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.25.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.25.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.26.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.26.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.26.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.27.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.27.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.27.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.28.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.28.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.28.up_proj.weight": "model-00003-of-00015.safetensors", + 
"model.model.layers.3.mlp.experts.29.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.29.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.29.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.3.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.3.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.3.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.30.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.30.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.30.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.31.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.31.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.31.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.32.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.32.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.32.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.33.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.33.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.33.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.34.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.34.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.34.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.35.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.35.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.35.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.36.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.36.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.36.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.37.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.37.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.37.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.38.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.38.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.38.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.39.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.39.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.39.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.4.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.4.gate_proj.weight": "model-00003-of-00015.safetensors", + 
"model.model.layers.3.mlp.experts.4.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.40.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.40.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.40.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.41.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.41.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.41.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.42.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.42.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.42.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.43.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.43.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.43.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.44.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.44.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.44.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.45.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.45.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.45.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.46.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.46.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.46.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.47.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.47.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.47.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.48.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.48.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.48.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.49.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.49.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.49.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.5.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.5.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.5.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.50.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.50.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.50.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.51.down_proj.weight": "model-00003-of-00015.safetensors", + 
"model.model.layers.3.mlp.experts.51.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.51.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.52.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.52.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.52.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.53.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.53.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.53.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.54.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.54.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.54.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.55.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.55.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.55.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.56.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.56.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.56.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.57.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.57.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.57.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.58.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.58.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.58.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.59.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.59.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.59.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.6.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.6.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.6.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.60.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.60.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.60.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.61.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.61.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.61.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.62.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.62.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.62.up_proj.weight": "model-00003-of-00015.safetensors", + 
"model.model.layers.3.mlp.experts.63.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.63.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.63.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.7.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.7.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.7.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.8.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.8.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.8.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.9.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.9.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.experts.9.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.gate.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.image_gate.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.shared_experts.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.mlp.shared_experts.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.3.post_attention_layernorm.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.attention.dense.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.attention.query_key_value.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.input_layernorm.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.audio_gate.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.0.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.0.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.0.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.1.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.1.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.1.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.10.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.10.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.10.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.11.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.11.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.11.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.12.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.12.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.12.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.13.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.13.gate_proj.weight": 
"model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.13.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.14.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.14.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.14.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.15.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.15.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.15.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.16.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.16.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.16.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.17.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.17.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.17.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.18.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.18.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.18.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.19.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.19.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.19.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.2.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.2.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.2.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.20.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.20.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.20.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.21.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.21.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.21.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.22.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.22.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.22.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.23.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.23.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.23.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.24.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.24.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.24.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.25.down_proj.weight": 
"model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.25.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.25.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.26.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.26.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.26.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.27.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.27.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.27.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.28.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.28.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.28.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.29.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.29.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.29.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.3.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.3.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.3.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.30.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.30.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.30.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.31.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.31.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.31.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.32.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.32.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.32.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.33.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.33.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.33.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.34.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.34.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.34.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.35.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.35.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.35.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.36.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.36.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.36.up_proj.weight": 
"model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.37.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.37.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.37.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.38.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.38.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.38.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.39.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.39.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.39.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.4.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.4.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.4.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.40.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.40.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.40.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.41.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.41.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.41.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.42.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.42.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.42.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.43.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.43.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.43.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.44.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.44.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.44.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.45.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.45.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.45.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.46.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.46.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.46.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.47.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.47.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.47.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.48.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.48.gate_proj.weight": 
"model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.48.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.49.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.49.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.49.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.5.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.5.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.5.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.50.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.50.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.50.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.51.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.51.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.51.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.52.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.52.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.52.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.53.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.53.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.53.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.54.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.54.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.54.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.55.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.55.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.55.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.56.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.56.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.56.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.57.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.57.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.57.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.58.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.58.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.58.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.59.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.59.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.59.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.6.down_proj.weight": 
"model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.6.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.6.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.60.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.60.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.60.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.61.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.61.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.61.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.62.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.62.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.62.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.63.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.63.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.63.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.experts.7.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.7.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.7.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.8.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.8.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.8.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.9.down_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.9.gate_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.experts.9.up_proj.weight": "model-00003-of-00015.safetensors", + "model.model.layers.4.mlp.gate.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.image_gate.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.shared_experts.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.mlp.shared_experts.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.4.post_attention_layernorm.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.attention.dense.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.attention.query_key_value.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.input_layernorm.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.audio_gate.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.0.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.0.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.0.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.1.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.1.gate_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.5.mlp.experts.1.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.10.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.10.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.10.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.11.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.11.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.11.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.12.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.12.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.12.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.13.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.13.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.13.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.14.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.14.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.14.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.15.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.15.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.15.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.16.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.16.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.16.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.17.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.17.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.17.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.18.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.18.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.18.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.19.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.19.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.19.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.2.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.2.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.2.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.20.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.20.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.20.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.21.down_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.5.mlp.experts.21.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.21.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.22.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.22.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.22.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.23.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.23.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.23.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.24.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.24.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.24.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.25.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.25.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.25.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.26.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.26.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.26.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.27.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.27.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.27.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.28.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.28.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.28.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.29.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.29.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.29.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.3.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.3.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.3.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.30.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.30.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.30.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.31.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.31.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.31.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.32.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.32.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.32.up_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.5.mlp.experts.33.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.33.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.33.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.34.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.34.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.34.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.35.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.35.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.35.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.36.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.36.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.36.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.37.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.37.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.37.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.38.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.38.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.38.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.39.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.39.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.39.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.4.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.4.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.4.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.40.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.40.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.40.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.41.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.41.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.41.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.42.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.42.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.42.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.43.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.43.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.43.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.44.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.44.gate_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.5.mlp.experts.44.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.45.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.45.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.45.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.46.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.46.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.46.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.47.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.47.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.47.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.48.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.48.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.48.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.49.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.49.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.49.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.5.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.5.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.5.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.50.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.50.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.50.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.51.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.51.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.51.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.52.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.52.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.52.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.53.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.53.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.53.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.54.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.54.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.54.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.55.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.55.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.55.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.56.down_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.5.mlp.experts.56.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.56.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.57.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.57.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.57.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.58.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.58.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.58.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.59.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.59.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.59.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.6.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.6.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.6.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.60.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.60.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.60.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.61.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.61.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.61.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.62.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.62.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.62.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.63.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.63.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.63.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.7.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.7.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.7.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.8.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.8.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.8.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.9.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.9.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.experts.9.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.gate.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.image_gate.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.shared_experts.down_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.mlp.shared_experts.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.5.post_attention_layernorm.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.attention.dense.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.attention.query_key_value.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.input_layernorm.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.audio_gate.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.0.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.0.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.0.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.1.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.1.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.1.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.10.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.10.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.10.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.11.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.11.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.11.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.12.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.12.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.12.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.13.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.13.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.13.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.14.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.14.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.14.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.15.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.15.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.15.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.16.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.16.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.16.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.17.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.17.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.17.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.18.down_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.6.mlp.experts.18.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.18.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.19.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.19.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.19.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.2.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.2.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.2.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.20.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.20.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.20.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.21.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.21.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.21.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.22.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.22.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.22.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.23.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.23.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.23.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.24.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.24.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.24.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.25.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.25.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.25.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.26.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.26.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.26.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.27.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.27.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.27.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.28.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.28.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.28.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.29.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.29.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.29.up_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.6.mlp.experts.3.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.3.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.3.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.30.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.30.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.30.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.31.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.31.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.31.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.32.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.32.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.32.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.33.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.33.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.33.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.34.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.34.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.34.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.35.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.35.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.35.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.36.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.36.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.36.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.37.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.37.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.37.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.38.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.38.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.38.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.39.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.39.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.39.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.4.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.4.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.4.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.40.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.40.gate_proj.weight": "model-00004-of-00015.safetensors", + 
"model.model.layers.6.mlp.experts.40.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.41.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.41.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.41.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.42.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.42.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.42.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.43.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.43.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.43.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.44.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.44.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.44.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.45.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.45.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.45.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.46.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.46.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.46.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.47.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.47.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.47.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.48.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.48.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.48.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.49.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.49.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.49.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.5.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.5.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.5.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.50.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.50.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.50.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.51.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.51.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.51.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.52.down_proj.weight": "model-00005-of-00015.safetensors", + 
"model.model.layers.6.mlp.experts.52.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.52.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.53.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.53.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.53.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.54.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.54.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.54.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.55.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.55.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.55.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.56.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.56.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.56.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.57.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.57.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.57.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.58.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.58.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.58.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.59.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.59.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.59.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.6.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.6.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.6.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.60.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.60.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.60.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.61.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.61.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.61.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.62.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.62.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.62.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.63.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.63.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.experts.63.up_proj.weight": "model-00005-of-00015.safetensors", + 
"model.model.layers.6.mlp.experts.7.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.7.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.7.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.8.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.8.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.8.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.9.down_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.9.gate_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.experts.9.up_proj.weight": "model-00004-of-00015.safetensors", + "model.model.layers.6.mlp.gate.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.image_gate.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.shared_experts.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.mlp.shared_experts.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.6.post_attention_layernorm.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.attention.dense.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.attention.query_key_value.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.input_layernorm.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.audio_gate.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.0.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.0.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.0.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.1.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.1.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.1.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.10.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.10.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.10.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.11.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.11.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.11.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.12.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.12.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.12.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.13.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.13.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.13.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.14.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.14.gate_proj.weight": 
"model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.14.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.15.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.15.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.15.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.16.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.16.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.16.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.17.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.17.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.17.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.18.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.18.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.18.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.19.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.19.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.19.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.2.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.2.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.2.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.20.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.20.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.20.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.21.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.21.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.21.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.22.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.22.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.22.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.23.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.23.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.23.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.24.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.24.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.24.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.25.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.25.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.25.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.26.down_proj.weight": 
"model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.26.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.26.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.27.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.27.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.27.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.28.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.28.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.28.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.29.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.29.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.29.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.3.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.3.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.3.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.30.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.30.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.30.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.31.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.31.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.31.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.32.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.32.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.32.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.33.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.33.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.33.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.34.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.34.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.34.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.35.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.35.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.35.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.36.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.36.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.36.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.37.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.37.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.37.up_proj.weight": 
"model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.38.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.38.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.38.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.39.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.39.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.39.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.4.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.4.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.4.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.40.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.40.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.40.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.41.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.41.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.41.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.42.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.42.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.42.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.43.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.43.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.43.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.44.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.44.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.44.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.45.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.45.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.45.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.46.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.46.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.46.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.47.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.47.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.47.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.48.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.48.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.48.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.49.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.49.gate_proj.weight": 
"model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.49.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.5.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.5.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.5.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.50.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.50.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.50.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.51.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.51.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.51.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.52.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.52.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.52.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.53.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.53.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.53.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.54.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.54.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.54.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.55.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.55.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.55.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.56.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.56.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.56.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.57.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.57.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.57.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.58.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.58.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.58.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.59.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.59.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.59.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.6.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.6.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.6.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.60.down_proj.weight": 
"model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.60.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.60.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.61.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.61.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.61.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.62.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.62.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.62.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.63.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.63.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.63.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.7.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.7.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.7.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.8.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.8.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.8.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.9.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.9.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.experts.9.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.gate.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.image_gate.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.shared_experts.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.mlp.shared_experts.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.7.post_attention_layernorm.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.attention.dense.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.attention.query_key_value.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.input_layernorm.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.audio_gate.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.0.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.0.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.0.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.1.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.1.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.1.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.10.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.10.gate_proj.weight": "model-00005-of-00015.safetensors", + 
"model.model.layers.8.mlp.experts.10.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.11.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.11.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.11.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.12.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.12.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.12.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.13.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.13.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.13.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.14.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.14.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.14.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.15.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.15.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.15.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.16.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.16.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.16.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.17.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.17.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.17.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.18.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.18.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.18.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.19.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.19.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.19.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.2.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.2.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.2.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.20.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.20.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.20.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.21.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.21.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.21.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.22.down_proj.weight": "model-00005-of-00015.safetensors", + 
"model.model.layers.8.mlp.experts.22.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.22.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.23.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.23.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.23.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.24.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.24.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.24.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.25.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.25.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.25.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.26.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.26.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.26.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.27.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.27.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.27.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.28.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.28.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.28.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.29.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.29.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.29.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.3.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.3.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.3.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.30.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.30.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.30.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.31.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.31.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.31.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.32.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.32.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.32.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.33.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.33.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.33.up_proj.weight": "model-00005-of-00015.safetensors", + 
"model.model.layers.8.mlp.experts.34.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.34.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.34.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.35.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.35.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.35.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.36.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.36.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.36.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.37.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.37.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.37.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.38.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.38.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.38.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.39.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.39.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.39.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.4.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.4.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.4.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.40.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.40.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.40.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.41.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.41.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.41.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.42.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.42.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.42.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.43.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.43.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.43.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.44.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.44.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.44.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.45.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.45.gate_proj.weight": "model-00005-of-00015.safetensors", + 
"model.model.layers.8.mlp.experts.45.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.46.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.46.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.46.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.47.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.47.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.47.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.48.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.48.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.48.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.49.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.49.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.49.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.5.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.5.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.5.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.50.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.50.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.50.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.51.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.51.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.51.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.52.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.52.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.52.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.53.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.53.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.53.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.54.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.54.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.54.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.55.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.55.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.55.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.56.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.56.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.56.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.57.down_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.8.mlp.experts.57.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.57.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.58.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.58.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.58.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.59.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.59.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.59.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.6.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.6.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.6.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.60.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.60.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.60.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.61.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.61.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.61.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.62.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.62.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.62.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.63.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.63.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.63.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.experts.7.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.7.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.7.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.8.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.8.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.8.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.9.down_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.9.gate_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.experts.9.up_proj.weight": "model-00005-of-00015.safetensors", + "model.model.layers.8.mlp.gate.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.image_gate.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.shared_experts.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.mlp.shared_experts.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.8.post_attention_layernorm.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.9.attention.dense.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.attention.query_key_value.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.input_layernorm.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.audio_gate.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.0.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.0.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.0.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.1.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.1.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.1.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.10.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.10.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.10.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.11.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.11.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.11.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.12.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.12.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.12.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.13.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.13.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.13.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.14.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.14.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.14.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.15.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.15.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.15.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.16.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.16.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.16.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.17.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.17.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.17.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.18.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.18.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.18.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.19.down_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.9.mlp.experts.19.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.19.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.2.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.2.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.2.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.20.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.20.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.20.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.21.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.21.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.21.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.22.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.22.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.22.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.23.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.23.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.23.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.24.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.24.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.24.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.25.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.25.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.25.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.26.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.26.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.26.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.27.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.27.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.27.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.28.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.28.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.28.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.29.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.29.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.29.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.3.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.3.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.3.up_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.9.mlp.experts.30.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.30.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.30.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.31.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.31.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.31.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.32.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.32.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.32.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.33.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.33.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.33.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.34.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.34.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.34.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.35.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.35.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.35.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.36.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.36.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.36.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.37.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.37.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.37.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.38.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.38.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.38.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.39.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.39.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.39.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.4.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.4.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.4.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.40.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.40.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.40.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.41.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.41.gate_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.9.mlp.experts.41.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.42.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.42.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.42.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.43.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.43.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.43.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.44.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.44.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.44.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.45.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.45.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.45.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.46.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.46.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.46.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.47.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.47.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.47.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.48.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.48.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.48.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.49.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.49.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.49.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.5.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.5.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.5.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.50.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.50.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.50.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.51.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.51.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.51.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.52.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.52.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.52.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.53.down_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.9.mlp.experts.53.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.53.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.54.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.54.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.54.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.55.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.55.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.55.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.56.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.56.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.56.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.57.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.57.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.57.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.58.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.58.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.58.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.59.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.59.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.59.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.6.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.6.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.6.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.60.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.60.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.60.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.61.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.61.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.61.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.62.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.62.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.62.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.63.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.63.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.63.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.7.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.7.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.7.up_proj.weight": "model-00006-of-00015.safetensors", + 
"model.model.layers.9.mlp.experts.8.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.8.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.8.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.9.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.9.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.experts.9.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.gate.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.image_gate.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.shared_experts.down_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.mlp.shared_experts.up_proj.weight": "model-00006-of-00015.safetensors", + "model.model.layers.9.post_attention_layernorm.weight": "model-00006-of-00015.safetensors", + "model.model.norm.weight": "model-00014-of-00015.safetensors", + "model.model.word_embeddings.weight": "model-00001-of-00015.safetensors", + "talker.model.lm_head.weight": "model-00015-of-00015.safetensors", + "talker.model.model.embed_tokens.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.0.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.1.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.1.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.10.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.11.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.12.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.input_layernorm.weight": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.13.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.13.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.14.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.15.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.16.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.16.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.17.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.18.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.19.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.19.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.2.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.20.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.21.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.21.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.22.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.23.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.3.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.3.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.4.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.5.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.6.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + 
"talker.model.model.layers.6.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.7.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.8.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.input_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.mlp.down_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.mlp.gate_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.mlp.up_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.post_attention_layernorm.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.k_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.k_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.o_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.q_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.q_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.v_proj.bias": "model-00015-of-00015.safetensors", + "talker.model.model.layers.9.self_attn.v_proj.weight": "model-00015-of-00015.safetensors", + "talker.model.model.norm.weight": "model-00015-of-00015.safetensors", + "talker.thinker_to_talker_proj.bias": 
"model-00015-of-00015.safetensors", + "talker.thinker_to_talker_proj.weight": "model-00015-of-00015.safetensors", + "talker.vp_head.bias": "model-00015-of-00015.safetensors", + "talker.vp_head.weight": "model-00015-of-00015.safetensors", + "vision.blocks.0.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.0.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.0.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.0.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.0.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.0.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.0.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.0.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.0.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.0.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.0.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.0.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.1.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.1.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.1.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.1.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.1.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.1.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.10.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.10.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.10.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.10.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.10.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.10.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.11.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.11.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.11.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.11.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.11.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.11.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.11.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.11.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + 
"vision.blocks.11.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.11.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.11.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.11.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.12.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.12.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.12.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.12.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.12.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.12.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.13.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.13.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.13.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.13.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.13.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.13.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.14.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.14.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.14.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.14.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.14.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.14.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.15.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.15.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.15.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + 
"vision.blocks.15.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.15.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.15.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.16.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.16.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.16.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.16.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.16.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.16.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.17.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.17.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.17.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.17.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.17.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.17.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.18.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.18.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.18.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.18.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.18.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.18.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.19.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.19.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.19.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.19.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.19.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.19.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + 
"vision.blocks.19.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.19.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.19.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.19.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.19.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.19.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.2.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.2.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.2.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.2.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.2.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.2.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.20.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.20.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.20.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.20.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.20.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.20.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.21.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.21.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.21.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.21.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.21.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.21.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.22.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.22.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.22.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.22.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.22.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.22.mlp.down_proj.weight": 
"model-00001-of-00015.safetensors", + "vision.blocks.22.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.22.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.22.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.22.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.22.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.22.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.23.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.23.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.23.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.23.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.23.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.23.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.24.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.24.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.24.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.24.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.24.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.24.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.25.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.25.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.25.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.25.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.25.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.25.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.26.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.26.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.mlp.down_proj.bias": 
"model-00001-of-00015.safetensors", + "vision.blocks.26.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.26.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.26.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.26.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.27.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.27.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.27.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.27.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.27.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.27.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.28.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.28.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.28.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.28.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.28.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.28.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.29.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.29.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.29.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.29.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.29.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.29.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.3.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.3.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.3.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.3.attn.qkv.weight": "model-00001-of-00015.safetensors", 
+ "vision.blocks.3.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.3.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.3.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.3.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.3.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.3.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.3.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.3.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.30.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.30.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.30.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.30.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.30.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.30.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.31.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.31.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.31.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.31.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.31.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.31.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.4.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.4.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.4.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.4.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.4.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.4.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.5.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.5.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.5.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.5.attn.qkv.weight": 
"model-00001-of-00015.safetensors", + "vision.blocks.5.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.5.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.5.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.5.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.5.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.5.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.5.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.5.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.6.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.6.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.6.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.6.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.6.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.6.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.7.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.7.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.7.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.7.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.7.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.7.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.8.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.8.attn.qkv.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.8.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.8.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.8.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.8.norm2.weight": "model-00001-of-00015.safetensors", + "vision.blocks.9.attn.proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.9.attn.proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.9.attn.qkv.bias": "model-00001-of-00015.safetensors", + "vision.blocks.9.attn.qkv.weight": 
"model-00001-of-00015.safetensors", + "vision.blocks.9.mlp.down_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.9.mlp.down_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.9.mlp.gate_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.9.mlp.gate_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.9.mlp.up_proj.bias": "model-00001-of-00015.safetensors", + "vision.blocks.9.mlp.up_proj.weight": "model-00001-of-00015.safetensors", + "vision.blocks.9.norm1.weight": "model-00001-of-00015.safetensors", + "vision.blocks.9.norm2.weight": "model-00001-of-00015.safetensors", + "vision.merger.ln_q.weight": "model-00001-of-00015.safetensors", + "vision.merger.mlp.0.bias": "model-00001-of-00015.safetensors", + "vision.merger.mlp.0.weight": "model-00001-of-00015.safetensors", + "vision.merger.mlp.2.bias": "model-00001-of-00015.safetensors", + "vision.merger.mlp.2.weight": "model-00001-of-00015.safetensors", + "vision.patch_embed.proj.weight": "model-00001-of-00015.safetensors" + } +} diff --git a/modeling_bailing_moe.py b/modeling_bailing_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..b8c0fcc3865d840da6f9a0a39be7b4d3a67b73de --- /dev/null +++ b/modeling_bailing_moe.py @@ -0,0 +1,1650 @@ +# coding=utf-8 +# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch BailingMoE model.""" +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import ( + MoeCausalLMOutputWithPast, + MoeModelOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ( + ALL_LAYERNORM_LAYERS, + is_torch_greater_or_equal_than_1_13, +) +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.utils.import_utils import is_torch_fx_available + +from configuration_bailing_moe import BailingMoeConfig + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BailingMoeConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + warnings.warn( + "Calling `transformers.models.BailingMoe.modeling_BailingMoe._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask" + ) + return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + warnings.warn( + "Calling `transformers.models.BailingMoe.modeling_BailingMoe._make_causal_mask` is deprecated and will be removed in v4.37. 
Use `transformers.models.BailingMoe.modeling_BailingMoe.AttentionMaskConverter._make_causal_mask" + ) + return AttentionMaskConverter._make_causal_mask( + input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length + ) + + +class BailingMoeRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + BailingMoeRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +ALL_LAYERNORM_LAYERS.append(BailingMoeRMSNorm) + + +class BailingMoeRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + self.max_seq_len_cached = None + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq.to(t.device)) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->BailingMoe +class BailingMoeLinearScalingRotaryEmbedding(BailingMoeRotaryEmbedding): + """BailingMoeRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->BailingMoe +class BailingMoeDynamicNTKScalingRotaryEmbedding(BailingMoeRotaryEmbedding): + """BailingMoeRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Inverse dim formula to find dim based on number of rotations +def yarn_find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + +# Find dim range bounds based on rotations +def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): + low = math.floor(yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +def yarn_linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +class BailingMoeYarnRotaryEmbedding(BailingMoeRotaryEmbedding): + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + original_max_position_embeddings=4096, + beta_fast=32, + beta_slow=1, + mscale=1, + mscale_all_dim=0, + ): + self.scaling_factor = 
scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = mscale + self.mscale_all_dim = mscale_all_dim + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + dim = self.dim + + freq_extra = 1.0 / (self.base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)) + freq_inter = 1.0 / ( + self.scaling_factor * self.base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + ) + + low, high = yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + dim, + self.base, + self.original_max_position_embeddings, + ) + inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(device=device, dtype=torch.float32) + inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(seq_len, device=device, dtype=torch.float32) + + freqs = torch.outer(t, inv_freq) + + _mscale = float( + yarn_get_mscale(self.scaling_factor, self.mscale) + / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) + ) + + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False) + self.register_buffer("sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding. 
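+
+        Example (shape sketch; the sizes below are arbitrary and the cos/sin values random, so only the output shapes are meaningful):
+
+        ```python
+        >>> import torch
+        >>> q = torch.randn(1, 8, 4, 64)   # [batch, heads, seq_len, head_dim]
+        >>> k = torch.randn(1, 2, 4, 64)   # fewer key/value heads (grouped-query attention)
+        >>> cos, sin = torch.randn(4, 64), torch.randn(4, 64)   # slices of the rotary cache
+        >>> position_ids = torch.arange(4).unsqueeze(0)
+        >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+        >>> q_rot.shape, k_rot.shape
+        (torch.Size([1, 8, 4, 64]), torch.Size([1, 2, 4, 64]))
+        ```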
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class BailingMoeMLP(nn.Module): + def __init__(self, config: BailingMoeConfig, intermediate_size: int): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class BailingMoeGate(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.num_experts = config.num_experts + + # topk selection algorithm + self.norm_topk_prob = config.norm_topk_prob + self.gating_dim = config.hidden_size + self.weight = nn.Parameter(torch.empty((self.num_experts, self.gating_dim))) + self.reset_parameters() + + def reset_parameters(self) -> None: + import torch.nn.init as init + + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + + def forward(self, hidden_states, sort=False): + bsz, seq_len, h = hidden_states.shape + # compute gating score + hidden_states = hidden_states.view(-1, h) + logits = F.linear(hidden_states, self.weight, None) + scores = logits.softmax(dim=-1, dtype=torch.float32) + + # select top-k experts + topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=sort) + + # norm gate to sum 1 + if self.top_k > 1 and self.norm_topk_prob: + denominator = topk_weight.sum(dim=-1, keepdim=True) + topk_weight = topk_weight / denominator + + return topk_idx, topk_weight, logits + + +class BailingMoeSparseMoeBlock(nn.Module): + """ + A mixed expert module containing shared experts. 
+ """ + + def __init__(self, config: BailingMoeConfig): + super().__init__() + self.config = config + self.num_experts_per_tok = config.num_experts_per_tok + self._setup_experts() + self.multi_gate = config.multi_gate + if self.multi_gate: + self.image_gate = BailingMoeGate(config) + self.audio_gate = BailingMoeGate(config) + self.gate = BailingMoeGate(config) + if config.num_shared_experts is not None: + self.shared_experts = BailingMoeMLP( + config=config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts + ) + + def _setup_experts(self): + self.experts = nn.ModuleList( + [ + BailingMoeMLP(config=self.config, intermediate_size=self.config.moe_intermediate_size) + for _ in range(self.config.num_experts) + ] + ) + + def create_mask(self, device, start_indices, end_indices, indices): + start_indices = torch.tensor(start_indices, device=device).view(-1, 1) + end_indices = torch.tensor(end_indices, device=device).view(-1, 1) + return (indices > start_indices) & (indices < end_indices) + + def forward(self, hidden_states: torch.Tensor, + image_start_indices: Optional[List[int]], + image_end_indices: Optional[List[int]], + audio_start_indices: Optional[List[int]], + audio_end_indices: Optional[List[int]]): + identity = hidden_states + bsz, seq_len, h = hidden_states.shape + + if self.multi_gate: + indices = torch.arange(seq_len, device=hidden_states.device).unsqueeze(0).expand(bsz, seq_len) + has_image = len(image_start_indices) != 0 and len(image_end_indices) != 0 + has_audio = len(audio_start_indices) != 0 and len(audio_end_indices) != 0 + # Get base text router results + topk_idx, topk_weight, router_logits = self.gate(hidden_states) + # router_probs = text_router_probs + # router_logits = text_router_logits + # top1_expert_index = text_top1_expert_index + + image_mask = None + audio_mask = None + + # Process image modality + if has_image: + image_mask = self.create_mask(hidden_states.device, image_start_indices, image_end_indices, indices) + image_topk_idx, image_topk_weight, image_router_logits = self.image_gate(hidden_states) + image_mask = image_mask.reshape(bsz * seq_len, 1) + + topk_idx = topk_idx * ~image_mask + image_topk_idx * image_mask + topk_weight = topk_weight * ~image_mask + image_topk_weight * image_mask + router_logits = router_logits * ~image_mask + image_router_logits * image_mask + + # Process audio modality + if has_audio: + audio_mask = self.create_mask(hidden_states.device, audio_start_indices, audio_end_indices, indices) + audio_topk_idx, audio_topk_weight, audio_router_logits = self.audio_gate(hidden_states) + audio_mask = audio_mask.reshape(bsz * seq_len, 1) + + topk_idx = topk_idx * ~audio_mask + audio_topk_idx * audio_mask + topk_weight = topk_weight * ~audio_mask + audio_topk_weight * audio_mask + router_logits = router_logits * ~audio_mask + audio_router_logits * audio_mask + + # Verify mask consistency when both modalities exist + if has_image and has_audio: + assert torch.logical_and(image_mask, audio_mask).sum() == 0 + # Error handling when no modalities found + if not has_image and not has_audio: + print(f'Error: No modalities found - ' + f'Image: {len(image_start_indices)}/{len(image_end_indices)}, ' + f'Audio: {len(audio_start_indices)}/{len(audio_end_indices)}') + else: + topk_idx, topk_weight, router_logits = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + flat_topk_idx = topk_idx.view(-1) + if self.training: + hidden_states = hidden_states.repeat_interleave(self.num_experts_per_tok, 
dim=0) + y = torch.empty_like(hidden_states) + for i, expert in enumerate(self.experts): + y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i]) + y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1) + y = y.to(hidden_states.dtype).view(bsz, seq_len, h) + else: + y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(bsz, seq_len, h) + if self.config.num_shared_experts is not None: + y = y + self.shared_experts(identity) + return y, (router_logits.view(bsz, seq_len, -1), topk_idx.view(bsz, seq_len, -1)) + + @torch.no_grad() + def moe_infer(self, x, topk_ids, topk_weight): + cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts))) + cnts.scatter_(1, topk_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = topk_ids.view(-1).argsort() + sorted_tokens = x[idxs // topk_ids.shape[1]] + sorted_tokens_shape = sorted_tokens.shape + tokens_per_expert = tokens_per_expert.cpu().numpy() + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + expert = self.experts[i] + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + expert_out = expert(tokens_for_this_expert) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) + new_x = torch.empty_like(outs) + new_x[idxs] = outs + final_out = ( + new_x.view(*topk_ids.shape, -1) + .type(topk_weight.dtype) + .mul_(topk_weight.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + return final_out + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->BailingMoe +class BailingMoeAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: BailingMoeConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim or self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.query_key_value = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.use_qkv_bias, + ) + self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = BailingMoeRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = BailingMoeLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = BailingMoeDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "yarn": + kwargs = { + key: self.config.rope_scaling[key] + for key in [ + "original_max_position_embeddings", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", + ] + if key in self.config.rope_scaling + } + self.rotary_emb = BailingMoeYarnRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + **kwargs, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.query_key_value(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) + + query_states, key_states, value_states = qkv.split( + [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2 + ) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states / math.sqrt(self.head_dim), key_states.transpose(2, 3)) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->BailingMoe +class BailingMoeFlashAttention2(BailingMoeAttention): + """ + BailingMoe flash attention module. This module inherits from `BailingMoeAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
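+        # Illustrative example: with q_len=2 and kv_len=4, a bottom-right aligned causal mask lets
+        # query 0 attend to keys 0..2 and query 1 to keys 0..3, whereas a top-left aligned mask
+        # would restrict query 0 to key 0 and query 1 to keys 0..1.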
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # BailingMoeFlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + + qkv = self.query_key_value(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) + + query_states, key_states, value_states = qkv.split( + [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2 + ) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently cast in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slow down training & inference so it is recommended to not cast the LayerNorms + # in fp32. (BailingMoeRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + # Handle the case where the model is quantized + if hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + elif torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + else: + target_dtype = self.query_key_value.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}."
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + query_length (`int`): + The length of the query sequence in terms of tokens. This represents the number of tokens in the + `query_states` tensor along the sequence dimension. It is used to determine the effective sequence + length for attention computations. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in BailingMoeFlashAttention2 __init__. 
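+            # Why disabling `causal` is safe here: for a single decode step the desired bottom-right
+            # causal mask lets the one query attend to every cached key, which is exactly what
+            # `causal=False` computes, so the top-left alignment of flash_attn<2.1 is sidestepped.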
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->BailingMoe +class BailingMoeSdpaAttention(BailingMoeAttention): + """ + BailingMoe attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `BailingMoeAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from BailingMoeAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "BailingMoeModel is using BailingMoeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. 
Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.query_key_value(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) + + query_states, key_states, value_states = qkv.split( + [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2 + ) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.dense(attn_output) + + return attn_output, None, past_key_value + + +BAILING_MOE_ATTENTION_CLASSES = { + "eager": BailingMoeAttention, + "flash_attention_2": BailingMoeFlashAttention2, + "sdpa": BailingMoeSdpaAttention, +} + + +class BailingMoeDecoderLayer(nn.Module): + def __init__(self, config: BailingMoeConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = BAILING_MOE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = ( + BailingMoeSparseMoeBlock(config) + if (config.num_experts is not None and layer_idx >= config.first_k_dense_replace) + else BailingMoeMLP(config=config, intermediate_size=config.intermediate_size) + ) + self.input_layernorm = BailingMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = BailingMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_start_indices: Optional[List[int]] = None, + image_end_indices: Optional[List[int]] = None, + audio_start_indices: Optional[List[int]] = None, + audio_end_indices: Optional[List[int]] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): + cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, + and should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states, + image_start_indices, + image_end_indices, + audio_start_indices, + audio_end_indices) + if isinstance(hidden_states, tuple): + hidden_states, router_logits = hidden_states + else: + router_logits = None + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +BAILINGMOE_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BailingMoeConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare BailingMoe Model outputting raw hidden-states without any specific head on top.", + BAILINGMOE_START_DOCSTRING, +) +class BailingMoePreTrainedModel(PreTrainedModel): + config_class = BailingMoeConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BailingMoeDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +BAILINGMOE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare BailingMoe Model outputting raw hidden-states without any specific head on top.", + BAILINGMOE_START_DOCSTRING, +) +class BailingMoeModel(BailingMoePreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`BailingMoeDecoderLayer`] + + Args: + config: BailingMoeConfig + """ + + def __init__(self, config: BailingMoeConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [BailingMoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._use_sdpa = config._attn_implementation == "sdpa" + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.norm = BailingMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, value): + self.word_embeddings = value + + @add_start_docstrings_to_model_forward(BAILINGMOE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + image_start_indices: Optional[List[int]] = None, + image_end_indices: Optional[List[int]] = None, + audio_start_indices: Optional[List[int]] = None, + audio_end_indices: Optional[List[int]] = None, + **kwargs, + ) -> Union[Tuple, MoeModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers." 
+ ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._use_sdpa and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + image_start_indices, + image_end_indices, + audio_start_indices, + audio_end_indices, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + image_start_indices=image_start_indices, + image_end_indices=image_end_indices, + audio_start_indices=audio_start_indices, + audio_end_indices=audio_end_indices, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if output_router_logits and layer_outputs[-1] is not None: + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + +class 
BailingMoeForCausalLM(BailingMoePreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: BailingMoeConfig): + super().__init__(config) + self.model = BailingMoeModel(config) + self.vocab_size = config.vocab_size + self.norm_head = config.norm_head + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.word_embeddings + + def set_input_embeddings(self, value): + self.model.word_embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def compute_logit(self, hidden_states): + if self.norm_head: + if self.training: + norm_weight = ( + self.lm_head.weight / (torch.norm(self.lm_head.weight, p=2, dim=0, keepdim=True) + 1e-7).detach() + ) + logits = F.linear(hidden_states, norm_weight, None) + else: + self.lm_head.weight.data = ( + self.lm_head.weight.data.float() + / (torch.norm(self.lm_head.weight.data.float(), p=2, dim=0, keepdim=True) + 1e-7) + ).to(hidden_states.dtype) + logits = F.linear(hidden_states, self.lm_head.weight.data, None) + self.norm_head = False + else: + logits = self.lm_head(hidden_states) + return logits + + @add_start_docstrings_to_model_forward(BAILINGMOE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + image_start_indices: Optional[List[int]] = None, + image_end_indices: Optional[List[int]] = None, + audio_start_indices: Optional[List[int]] = None, + audio_end_indices: Optional[List[int]] = None, + **kwargs, + ) -> Union[Tuple, MoeCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer + + >>> model = BailingMoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + image_start_indices=image_start_indices, + image_end_indices=image_end_indices, + audio_start_indices=audio_start_indices, + audio_end_indices=audio_end_indices, + **kwargs, + ) + + hidden_states = outputs[0] + + logits = self.compute_logit(hidden_states=hidden_states) + logits = logits.float() + + loss = None + aux_loss = None + + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + token_type_ids=None, + image_start_indices=None, + image_end_indices=None, + audio_start_indices=None, + audio_end_indices=None, + **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = ( + past_key_values.get_max_length() + if hasattr(past_key_values, "get_max_length") + else past_key_values.get_max_cache_shape() + ) + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
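+                # (Case 3 needs no slicing: during cached generation, `generate()` typically passes only the
+                # newly generated token(s) as `input_ids`, so everything present is still unprocessed.)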
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "image_start_indices": image_start_indices, + "image_end_indices": image_end_indices, + "audio_start_indices": audio_start_indices, + "audio_end_indices": audio_end_indices + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past diff --git a/modeling_bailing_talker.py b/modeling_bailing_talker.py new file mode 100644 index 0000000000000000000000000000000000000000..0259c7cbcb796e05acfc70bd3253facf5becdc13 --- /dev/null +++ b/modeling_bailing_talker.py @@ -0,0 +1,290 @@ +from dataclasses import dataclass +from typing import Optional, Tuple, List +import torch +import torch.nn as nn +import torchaudio +from hyperpyyaml import load_hyperpyyaml + +from transformers import Qwen2Config, PreTrainedModel +from transformers import Qwen2ForCausalLM, AutoTokenizer +from audio_detokenizer.cli.model import AudioDetokenizerModel +from s3bpe_tokenizer import S3BpeTokenizer +from configuration_bailing_talker import BailingTalkerConfig +from transformers.utils import ModelOutput + + +@dataclass +class BailingTalkerOutputWithPast(ModelOutput): + loss: Optional[torch.FloatTensor] = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[torch.FloatTensor] = None + + +class BailingTalkerForConditionalGeneration(PreTrainedModel): + config_class = BailingTalkerConfig + base_model_prefix = 'model' + + def __init__(self, config:BailingTalkerConfig): + super().__init__(config) + + self.config = config + self.vocab_size = self.config.vocab_size + self.tokenizer = AutoTokenizer.from_pretrained(self.config._name_or_path) + self.model_config = Qwen2Config.from_pretrained(self.config._name_or_path) + self.model = Qwen2ForCausalLM(self.model_config) + self.model.resize_token_embeddings(self.vocab_size) + self.thinker_to_talker_proj = nn.Linear(self.config.qa_model_hidden_size, self.model_config.hidden_size) + self.vp_head = nn.Conv1d( + self.config.vp_feature_size, + self.model_config.hidden_size, + kernel_size=self.config.vp_kernel_size, + stride=self.config.vp_stride, + padding=self.config.vp_kernel_size // 2, + ) + self.s3bpe_tokenizer = 
S3BpeTokenizer(bpe_model=f"{self.config._name_or_path}/s3_bpe/tokenizer.json", mapping_file=f"{self.config._name_or_path}/s3_bpe/char_mapping.txt") + + self.loss_function = nn.CrossEntropyLoss() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def encode_audio_segments( + self, + inputs_embeds: torch.FloatTensor, + vp_emb: torch.FloatTensor, + vp_insert_loc: torch.LongTensor, + thinker_reply_part: Optional[torch.FloatTensor] = None, + thinker_reply_length: Optional[List] = None, + thinker_prefix_insert_loc: Optional[torch.LongTensor] = None + ): + vp_emb_encoded = self.vp_head(vp_emb.transpose(-1, -2)).transpose(-1, -2) + + for idx in range(vp_insert_loc.shape[0]): + inputs_embeds[idx, vp_insert_loc[idx].item():vp_insert_loc[idx].item() + 1, :] = vp_emb_encoded[idx, :, :] + + if thinker_prefix_insert_loc is not None: + thinker_reply_part = self.thinker_to_talker_proj(thinker_reply_part) + for idx in range(thinker_prefix_insert_loc.shape[0]): + real_length = thinker_reply_length[idx] + inputs_embeds[idx, thinker_prefix_insert_loc[idx].item():thinker_prefix_insert_loc[idx].item() + real_length, :] = thinker_reply_part[idx, :real_length, :] + + return inputs_embeds + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[dict] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + text_input_ids: Optional[torch.LongTensor] = None, + vp_emb: Optional[torch.FloatTensor] = None, + vp_insert_loc: Optional[torch.LongTensor] = None, + thinker_reply_part: Optional[torch.FloatTensor] = None, + thinker_reply_length: Optional[torch.FloatTensor] = None, + thinker_prefix_insert_loc: Optional[torch.LongTensor] = None, + ): + + if inputs_embeds is None: + audio_input_embeds = self.model.get_input_embeddings()(input_ids) + text_input_embeds = self.model.get_input_embeddings()(text_input_ids) + inputs_embeds = audio_input_embeds + text_input_embeds + if past_key_values is None: + inputs_embeds = self.encode_audio_segments( + inputs_embeds, vp_emb, vp_insert_loc, thinker_reply_part=thinker_reply_part, + thinker_reply_length=thinker_reply_length, thinker_prefix_insert_loc=thinker_prefix_insert_loc + ) + + if position_ids is None: + position_ids = (attention_mask.cumsum(-1) - 1).masked_fill_((attention_mask == 0), 1) + + outputs = self.model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = outputs.logits + + loss = None + if labels is not None: + loss = self.loss_function(logits.reshape(-1, logits.size(-1)), labels.reshape(-1)) + + return BailingTalkerOutputWithPast( + loss=loss, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + logits=logits, + ) + + def sample(self, logits, topk=20, filter_value=-float("Inf")): + logits = logits.reshape(1, -1) # [1, V] + indices_to_remove = logits < torch.topk(logits, topk)[0][..., -1, None] + logits[indices_to_remove] = filter_value + token_id = torch.multinomial(torch.softmax(logits, dim=-1), 
num_samples=1).to(torch.long) + return token_id + + def omni_tts_binary_generation( + self, + tts_text, + vp_emb=None, + thinker_reply_part=None, + ): + # thinker_reply_part: [B, T, d] + # get text_emb and hidden_states from thinker + thinker_length = thinker_reply_part.size(1) if thinker_reply_part is not None else 0 + prefix_from_thinker = ( + self.tokenizer.encode("") + + self.tokenizer.encode("") * thinker_length + # placeholder for prefix emb from thinker + self.tokenizer.encode("") + ) + + prompt = self.tokenizer.encode("") + self.tokenizer.encode("") + + text_input_part = self.tokenizer.encode(tts_text) + vp = ( + self.tokenizer.encode("") + + self.tokenizer.encode("") + + self.tokenizer.encode("") + ) + + # audio_prefix and text_prefix for first step generation + talker_text_prefix = ( + prompt + + prefix_from_thinker + + vp + + text_input_part[:1] + ) + + talker_audio_prefix = ( + prompt + + prefix_from_thinker + + vp + + self.tokenizer.encode("") + ) + + # the rest of input_text + talker_text_input_part = ( + text_input_part[1:] + + self.tokenizer.encode("") + + self.tokenizer.encode("") + ) + + attention_mask = torch.ones(len(talker_audio_prefix)).reshape(1, -1).to(self.device) + position_ids = (attention_mask.cumsum(-1) - 1).masked_fill_((attention_mask == 0), 1)[:, -1].view(1, -1) + + talker_audio_prefix = torch.tensor(talker_audio_prefix).reshape(1, -1).to(self.device) + talker_text_prefix = torch.tensor(talker_text_prefix).reshape(1, -1).to(self.device) + vp_insert_loc = torch.tensor(len(prompt) + len(prefix_from_thinker) + 1, dtype=torch.long).reshape(1, -1) + vp_emb = vp_emb.unsqueeze(0).to(torch.bfloat16).to(self.device) + + audio_token = self.generate( + talker_audio_prefix=talker_audio_prefix, + talker_text_prefix=talker_text_prefix, + talker_text_input_part=talker_text_input_part, + position_ids=position_ids, + vp_emb=vp_emb, + vp_insert_loc=vp_insert_loc, + thinker_reply_part=thinker_reply_part, + thinker_reply_length=torch.tensor([thinker_length]).to(self.device), + thinker_prefix_insert_loc=torch.tensor([len(prompt) + 1]).to(self.device) if thinker_reply_part is not None else None, + ) + + audio_token = [ele - len(self.tokenizer) for ele in audio_token] + audio_token = self.s3bpe_tokenizer.decode(audio_token) + audio_token = torch.tensor([audio_token], dtype=torch.int32) + + return audio_token + + @torch.no_grad() + def generate( + self, + talker_audio_prefix: torch.LongTensor, + talker_text_prefix: torch.LongTensor, + talker_text_input_part: List, + position_ids: Optional[torch.LongTensor] = None, + vp_emb: Optional[torch.FloatTensor] = None, + vp_insert_loc: Optional[torch.LongTensor] = None, + thinker_reply_part: Optional[torch.FloatTensor] = None, + thinker_reply_length: Optional[torch.FloatTensor] = None, + thinker_prefix_insert_loc: Optional[torch.LongTensor] = None, + ): + result = [] + step = 0 + eos_id = self.tokenizer.encode("")[0] + while step < 1000: + if step == 0: + talker_audio_input_ids = talker_audio_prefix + talker_text_input_ids = talker_text_prefix + attention_mask = torch.ones(talker_audio_input_ids.shape).to(talker_audio_prefix.device) + + else: + talker_audio_input_ids = next_token + talker_text_input_ids = torch.tensor(talker_text_input_part[0], dtype=torch.long).reshape(1, -1).to( + talker_audio_prefix.device) + attention_mask = torch.ones(next_token.shape[0], 1).to(talker_audio_prefix.device) + position_ids += 1 + thinker_prefix_insert_loc = None + + if len(talker_text_input_part) > 1: + talker_text_input_part = talker_text_input_part[1:] + 
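+            # After the first step only the newly sampled audio token (plus the next text token) is fed;
+            # `past_key_values` from the previous call carries the rest of the context.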
# print(talker_audio_input_ids, self.tokenizer.decode(talker_text_input_ids.tolist()[0]), attention_mask, position_ids) + outputs = self( + input_ids=talker_audio_input_ids, + text_input_ids=talker_text_input_ids, + thinker_reply_part=thinker_reply_part, + thinker_reply_length=thinker_reply_length, + thinker_prefix_insert_loc=thinker_prefix_insert_loc, + vp_emb=vp_emb, + vp_insert_loc=vp_insert_loc, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=True, + past_key_values=outputs.past_key_values if step > 0 else None + ) + # 采样 + logits = outputs.logits[:, -1, :] + + next_token = self.sample(logits) + if next_token.item() == eos_id: + break + result.append(next_token.item()) + step += 1 + + return result + + +class AudioDetokenizer: + def __init__(self, config_path, flow_model_path, hifigan_model_path): + with open(config_path, 'r') as f: + configs = load_hyperpyyaml(f) + + self.model = AudioDetokenizerModel(configs['flow'], configs['hift']) + self.model.load(flow_model_path, hifigan_model_path) + self.sr = 22050 + + def token2wav(self, audio_token, flow_embedding, save_path=None): + model_input = {"tts_speech_token": audio_token, + 'flow_embedding': flow_embedding} + model_output = self.model.inference(**model_input) + + silent_dur = 0.02 + silent_tensor = torch.Tensor([0.0] * int(self.sr * silent_dur)) + model_output['tts_speech'][0][:int(self.sr * silent_dur)] = silent_tensor + + if save_path is not None: + torchaudio.save(save_path, model_output['tts_speech'], sample_rate=self.sr) + return model_output['tts_speech'] diff --git a/modeling_bailingmm.py b/modeling_bailingmm.py new file mode 100644 index 0000000000000000000000000000000000000000..69e5ae857af898fc0f8715bf557ea354cd5d0ea4 --- /dev/null +++ b/modeling_bailingmm.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright (c) Ant Group. All rights reserved. + +import copy +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers import PreTrainedModel +from transformers.modeling_outputs import ModelOutput +from transformers.utils import logging +from configuration_bailingmm import BailingMMConfig +from modeling_utils import patch_continuous_features + +# audio encoder +from funasr.models.sanm.encoder import SANMEncoder +from modeling_bailing_moe import BailingMoeForCausalLM +from modeling_utils import Transpose, encode_audio_segments + +# vision encoder +from qwen2_5_vit import Qwen2_5_VisionTransformer + +# talker +from modeling_bailing_talker import BailingTalkerForConditionalGeneration + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BailingMMConfig" + + +@dataclass +class BailingMMCausalLMOutputWithPast(ModelOutput): + """ + Base class for BailingMM causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + +class BailingMMNativeForConditionalGeneration(PreTrainedModel): + config_class = BailingMMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BailingAudioModel"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + + def __init__( + self, + config: BailingMMConfig, + ): + super().__init__(config) + self.config: BailingMMConfig = config + self.vision = None + self.audio = None + self.talker = None + + self.llm_dytpe = torch.bfloat16 + + if self.config.vision_config: + self.vision = Qwen2_5_VisionTransformer(self.config.vision_config) + + if self.config.audio_config: + self.audio = SANMEncoder(**self.config.audio_config.audio_encoder_config_sanm) + + self.model = BailingMoeForCausalLM(self.config.llm_config) + + mlp_modules_img = [nn.Linear(self.vision.image_emb_dim, self.model.config.hidden_size)] + for _ in range(1, self.config.mlp_depth): + mlp_modules_img.append(nn.GELU()) + mlp_modules_img.append(nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size)) + self.linear_proj = nn.Sequential(*mlp_modules_img) + + if self.audio: + audio_encoder_proj = torch.nn.Conv1d( + self.config.audio_config.audio_encoder_output_size, + self.model.config.hidden_size, + kernel_size=self.config.audio_config.ds_kernel_size, + stride=self.config.audio_config.ds_stride, + padding=self.config.audio_config.ds_kernel_size // 2, + ) + + mlp_modules_audio = [audio_encoder_proj, Transpose(-1, -2)] + for _ in range(1, self.config.mlp_depth): + mlp_modules_audio.append(nn.GELU()) + mlp_modules_audio.append(nn.Linear( + self.model.config.hidden_size, self.model.config.hidden_size + )) + mlp_modules_audio.append(Transpose(-1, -2)) + self.linear_proj_audio = nn.Sequential(*mlp_modules_audio) + + if self.config.talker_config: + self.config.talker_config._name_or_path = 
f'{self.config._name_or_path}/talker' + self.talker = BailingTalkerForConditionalGeneration(self.config.talker_config) + + self.post_init() + + def extract_image_feature(self, pixel_values, grid_thw): + with torch.cuda.amp.autocast(dtype=torch.bfloat16): + image_embeds = self.vision(pixel_values, grid_thw=grid_thw) + image_embeds = image_embeds.float() + image_embeds = self.linear_proj(image_embeds) + image_embeds = F.normalize(image_embeds, dim=-1) + return image_embeds + + def extract_audio_feature(self, audio_feats, audio_feats_lengths): + assert self.audio is not None + assert self.linear_proj_audio is not None + audio_embeds, _, audio_embeds_lengths = encode_audio_segments( + encoder=self.audio, + proj_layer=self.linear_proj_audio, + wav_feats=audio_feats, + wav_feats_lengths=audio_feats_lengths, + ) + if self.config.audio_config.norm_query_embeds: + audio_embeds = F.normalize(audio_embeds, dim=2) # [-1, 256, 2048] + return audio_embeds, audio_embeds_lengths + + @torch.no_grad() + def generate( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + audio_feats: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + audio_feats_lengths: Optional[torch.LongTensor] = None, + audio_placeholder_loc_lens: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + **generate_kwargs, + ): + image_embeds, video_embeds, audio_embeds, audio_embeds_lengths = None, None, None, None + if pixel_values is not None: + image_embeds = self.extract_image_feature(pixel_values, grid_thw=image_grid_thw) + if pixel_values_videos is not None: + video_embeds = self.extract_image_feature(pixel_values_videos, grid_thw=video_grid_thw) + if audio_feats is not None: + audio_embeds, audio_embeds_lengths = self.extract_audio_feature(audio_feats, audio_feats_lengths) + + with torch.cuda.amp.autocast(dtype=torch.bfloat16): + if (image_embeds is None and video_embeds is None and audio_embeds is None) or input_ids.size(1) == 1: + words_embeddings = self.model.get_input_embeddings()(input_ids.clip(0, self.model.get_input_embeddings().weight.shape[0] - 1)) + # input_shape = input_ids.size() + batch_size = input_ids.size(0) if input_ids is not None else inputs_embeds.size(0) + image_start_indices = [999999] * batch_size + image_end_indices = [999999] * batch_size + audio_start_indices = [999999] * batch_size + audio_end_indices = [999999] * batch_size + + else: + words_embeddings, image_start_indices, image_end_indices, audio_start_indices, audio_end_indices = self.prompt_wrap_navit( + input_ids.clip(0, self.model.get_input_embeddings().weight.shape[0] - 1), image_embeds, video_embeds, audio_embeds, + audio_embeds_lengths, audio_placeholder_loc_lens, None, # noqa + ) + + outputs = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=words_embeddings, + use_cache=use_cache, + image_start_indices=image_start_indices, + image_end_indices=image_end_indices, + audio_start_indices=audio_start_indices, + audio_end_indices=audio_end_indices, + **generate_kwargs, + ) + return outputs + + def prompt_wrap_vision(self, input_ids, inputs_embeds, vision_embeds, 
image_token_id=None): + if vision_embeds is None or input_ids is None: + return inputs_embeds + + if len(vision_embeds.shape) == 3: + vision_embeds = vision_embeds.reshape(-1, vision_embeds.shape[-1]) + + self.config.llm_config.image_patch_token = image_token_id if image_token_id is not None else self.config.llm_config.image_patch_token + n_image_tokens = (input_ids == self.config.llm_config.image_patch_token).sum().item() + n_image_features = vision_embeds.shape[0] + + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + seq_len = input_ids.shape[1] + is_image_token = (input_ids == self.config.llm_config.image_patch_token).int() + first_indices = torch.argmax(is_image_token, dim=-1) - 1 + last_indices_flipped = torch.argmax( is_image_token.flip(dims = [1]), dim=-1) + last_indices = (seq_len - 0) - last_indices_flipped + + image_mask = ( + (input_ids == self.config.llm_config.image_patch_token) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + return inputs_embeds, first_indices.reshape(-1).tolist(), last_indices.reshape(-1).tolist() + + def prompt_wrap_audio(self, input_ids, inputs_embeds, audio_embeds, audio_embeds_lengths, placeholder_audio_loc_lens): + assert placeholder_audio_loc_lens.shape[1] == 1, f"Currently MoE models do not support multiple audios in a single sample, but placeholder_audio_loc_lens = {placeholder_audio_loc_lens}" + inputs_embeds = patch_continuous_features( + input_embeddings=inputs_embeds, placeholder_loc_lens=placeholder_audio_loc_lens, + encoded_feats=audio_embeds, encoded_feat_lens=audio_embeds_lengths, + ) + first_indices = placeholder_audio_loc_lens[:, 0, 0] - 1 + last_indices = placeholder_audio_loc_lens[:, 0, 0] + placeholder_audio_loc_lens[:, 0, 1] + return inputs_embeds, first_indices.reshape(-1).tolist(), last_indices.reshape(-1).tolist() + + def prompt_wrap_navit(self, input_ids, query_embeds_image=None, query_embeds_video=None, query_embeds_audio=None, + query_embeds_audio_lengths=None, placeholder_audio_loc_lens=None, target_embeds=None): + inputs_embeds = self.model.get_input_embeddings()(input_ids) + if query_embeds_image is None and query_embeds_video is None and query_embeds_audio is None and target_embeds is None: + return inputs_embeds + + audio_start_indices_list = None + audio_end_indices_list = None + image_start_indices_list = None + image_end_indices_list = None + + batch_size = input_ids.shape[0] + + if query_embeds_image is not None: + inputs_embeds, image_start_indices_list, image_end_indices_list = self.prompt_wrap_vision(input_ids, inputs_embeds, query_embeds_image) + if query_embeds_video is not None: + inputs_embeds, image_start_indices_list, image_end_indices_list = self.prompt_wrap_vision(input_ids, inputs_embeds, query_embeds_video) + if query_embeds_audio is not None: + inputs_embeds, audio_start_indices_list, audio_end_indices_list = self.prompt_wrap_audio( + input_ids, inputs_embeds, query_embeds_audio, query_embeds_audio_lengths, placeholder_audio_loc_lens, + ) + + if audio_start_indices_list is None: audio_start_indices_list = [99999] * batch_size + if audio_end_indices_list is None: audio_end_indices_list = [99999] * batch_size + if image_start_indices_list is None: image_start_indices_list = [99999] * batch_size + if 
image_end_indices_list is None: image_end_indices_list = [99999] * batch_size + + return inputs_embeds, image_start_indices_list, image_end_indices_list, audio_start_indices_list, audio_end_indices_list + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + audio_feats: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + audio_feats_lengths: Optional[torch.LongTensor] = None, + audio_placeholder_loc_lens: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + ) -> Union[Tuple, BailingMMCausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if (pixel_values is not None or pixel_values_videos is not None or audio_feats is not None) and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values/pixel_values_videos/pixel_values_audios and inputs_embeds at the same time, and must specify either one" + ) + + image_embeds, video_embeds, audio_embeds, audio_embeds_lengths = None, None, None, None + if pixel_values is not None: + image_embeds = self.extract_image_feature(pixel_values, grid_thw=image_grid_thw) + if pixel_values_videos is not None: + video_embeds = self.extract_image_feature(pixel_values_videos, grid_thw=video_grid_thw) + if audio_feats is not None: + audio_embeds, audio_embeds_lengths = self.extract_audio_feature(audio_feats, audio_feats_lengths) + + if (image_embeds is None and video_embeds is None and audio_embeds is None) or input_ids.size(1) == 1: + words_embeddings = self.model.get_input_embeddings()(input_ids.clip(0, self.model.get_input_embeddings().weight.shape[0] - 1)) + batch_size = input_ids.size(0) if input_ids is not None else inputs_embeds.size(0) + image_indices = [999999] * batch_size + image_end_indices = [999999] * batch_size + audio_indices = [999999] * batch_size + audio_end_indices = [999999] * batch_size + + else: + words_embeddings, image_indices, image_end_indices, audio_indices, audio_end_indices = self.prompt_wrap_navit( + input_ids.clip(0, self.model.get_input_embeddings().weight.shape[0] - 1), image_embeds, video_embeds, audio_embeds, + audio_embeds_lengths, audio_placeholder_loc_lens, None, # noqa + ) + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=words_embeddings, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + image_indices=image_indices, + image_end_indices=image_end_indices, + audio_indices=audio_indices, + audio_end_indices=audio_end_indices, + ) + + return BailingMMCausalLMOutputWithPast( + loss=outputs.loss, + logits=outputs.logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + ) diff --git a/modeling_utils.py b/modeling_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8e462b8b42e786ae0718e262cddd15135446af50 --- /dev/null +++ b/modeling_utils.py @@ -0,0 +1,979 @@ +#!/usr/bin/env python +# coding=utf-8 +# @Author: jiangpeijie.jpj +# @Date: Mon 4 Dec 2023 05:21:28 PM CST + +import logging +import math +from dataclasses import dataclass +from typing import Optional + +import torch +import torch.nn.functional as F +import torch.distributed as dist +from numpy import random +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2CLS, ClassInstantier + +try: + from atorch.distributed.distributed import parallel_group, parallel_group_size +except Exception: + parallel_group = None + parallel_group_size = None + + +# ## Activations +class SwiGLUActivatition(nn.Module): + + def forward(self, input): + input = torch.chunk(input, 2, dim=-1) + return F.silu(input[0]) * input[1] + + +ACT2CLS["swiglu"] = SwiGLUActivatition +ACT2FN = ClassInstantier(ACT2CLS) + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") +swiglu = get_activation("swiglu") + + +# Rotary Position Embedding Utils +def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + +def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): + """Find dim range bounds based on rotations""" + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + # dim=-1 triggers a bug in earlier torch versions + return torch.cat((-x2, x1), dim=x1.ndim - 1) + + +# Comment torchscript func for accurate calculate +# @torch.jit.script +def apply_rotary_pos_emb_index(q, k, cos, sin, position_id): + # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn] + cos, sin = F.embedding(position_id, cos.squeeze(1)).unsqueeze(2), F.embedding( + position_id, sin.squeeze(1) + ).unsqueeze(2) + q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) 
* sin) + return q, k + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, base=10000, precision=torch.half, learnable=False): + super().__init__() + self.dim = dim + self.base = base + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + # keep inv_freq in float32 to avoid bf16 precision loss + # inv_freq = inv_freq.to(precision) + self.learnable = learnable + if learnable: + self.inv_freq = torch.nn.Parameter(inv_freq) + self.max_seq_len_cached = None + else: + self.register_buffer('inv_freq', inv_freq) + self.max_seq_len_cached = None + self.cos_cached = None + self.sin_cached = None + self.precision = precision + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + pass + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached): + self.max_seq_len_cached = None if self.learnable else seq_len + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) + t = torch.arange(seq_len, device=x.device, dtype=torch.float32) + # freqs = torch.einsum('i,j->ij', t, inv_freq.to(x.device)) + freqs = torch.outer(t, inv_freq.to(x.device)) + assert freqs.dtype == torch.float32 + # Different from the paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + if self.precision == torch.bfloat16: + emb = emb.float() + + # [sx, 1 (b * np), hn] + cos_cached = emb.cos()[:, None, :] + sin_cached = emb.sin()[:, None, :] + if self.precision == torch.bfloat16: + cos_cached = cos_cached.bfloat16() + sin_cached = sin_cached.bfloat16() + if self.learnable: + return cos_cached, sin_cached + self.cos_cached, self.sin_cached = cos_cached, sin_cached + return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
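+    # Note: `_apply` below keeps the cached cos/sin tables in sync when the module is moved or cast
+    # (e.g. `.to(device)`, `.half()`), since they are plain attributes rather than registered buffers.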
+ + def _apply(self, fn, *args, **kwargs): + if self.cos_cached is not None: + self.cos_cached = fn(self.cos_cached) + if self.sin_cached is not None: + self.sin_cached = fn(self.sin_cached) + return super()._apply(fn, *args, **kwargs) + + +class LinearScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with linear scaling.""" + + def __init__( + self, dim, base=10000, precision=torch.half, learnable=False, max_embedding_length=2048, scaling_factor=1.0 + ): + self.scaling_factor = scaling_factor + self.max_embedding_length = max_embedding_length + super().__init__(dim, base, precision, learnable) + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached): + self.max_seq_len_cached = None if self.learnable else seq_len + + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) + t = torch.arange(seq_len, device=x.device, dtype=torch.float32) + t = t / self.scaling_factor + # freqs = torch.einsum('i,j->ij', t, inv_freq.to(x.device)) + freqs = torch.outer(t, inv_freq.to(x.device)) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + if self.precision == torch.bfloat16: + emb = emb.float() + + # [sx, 1 (b * np), hn] + cos_cached = emb.cos()[:, None, :] + sin_cached = emb.sin()[:, None, :] + if self.precision == torch.bfloat16: + cos_cached = cos_cached.bfloat16() + sin_cached = sin_cached.bfloat16() + if self.learnable: + return cos_cached, sin_cached + self.cos_cached, self.sin_cached = cos_cached, sin_cached + return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] + + +class NTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK scaling.""" + + def __init__( + self, dim, base=10000, precision=torch.half, learnable=False, max_embedding_length=2048, scaling_factor=1.0 + ): + self.scaling_factor = scaling_factor + self.max_embedding_length = max_embedding_length + super().__init__(dim, base, precision, learnable) + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached): + self.max_seq_len_cached = None if self.learnable else seq_len + + base = self.base + if seq_len > self.max_embedding_length: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_embedding_length) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=x.device).float() / self.dim)) + t = torch.arange(seq_len, device=x.device, dtype=torch.float32) + # freqs = torch.einsum('i,j->ij', t, inv_freq.to(x.device)) + freqs = torch.outer(t, inv_freq.to(x.device)) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + if self.precision == torch.bfloat16: + emb = emb.float() + + # [sx, 1 (b * np), hn] + cos_cached = emb.cos()[:, None, :] + sin_cached = emb.sin()[:, None, :] + if self.precision == torch.bfloat16: + cos_cached = cos_cached.bfloat16() + sin_cached = sin_cached.bfloat16() + if self.learnable: + return cos_cached, sin_cached + self.cos_cached, self.sin_cached = cos_cached, sin_cached + return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] 
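+# Illustrative sketch (not part of the original file): how the RoPE modules above are typically
+# consumed together with `apply_rotary_pos_emb_index`. Shapes follow the [sq, b, np, hn] convention
+# noted in the comments above; the helper name below is hypothetical.
+def _rope_usage_sketch():
+    sq, b, np_, hn = 8, 1, 2, 64
+    q = torch.randn(sq, b, np_, hn)
+    k = torch.randn(sq, b, np_, hn)
+    position_id = torch.arange(sq).unsqueeze(-1)  # [sq, b]
+    rope = RotaryEmbedding(dim=hn, precision=torch.float32)
+    cos, sin = rope(q, seq_dim=0)  # cos/sin: [sq, 1, hn]
+    # The scaled variants (LinearScaling / NTKScaling / DynamicYaRN) are drop-in replacements that
+    # only change how the cos/sin tables are built.
+    return apply_rotary_pos_emb_index(q, k, cos, sin, position_id)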
+ + +class DynamicYaRNScaledRotaryEmbedding(RotaryEmbedding): + def __init__( + self, + dim, + base=10000, + precision=torch.half, + learnable=False, + max_embedding_length=2048, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ): + self.max_embedding_length = max_embedding_length + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + + super().__init__(dim, base, precision, learnable) + + def forward(self, x, seq_dim=1, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + if seq_len is None: + seq_len = x.shape[seq_dim] + + if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_embedding_length: + self.yarn(seq_len / self.max_embedding_length, x.device) + + t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) + # freqs = torch.einsum('i,j->ij', t, inv_freq.to(x.device)) + freqs = torch.outer(t, self.inv_freq.to(x.device)) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + if self.precision == torch.bfloat16: + emb = emb.float() + + cos_cached = emb.cos()[:, None, :] + sin_cached = emb.sin()[:, None, :] + if self.precision == torch.bfloat16: + cos_cached = cos_cached.bfloat16() + sin_cached = sin_cached.bfloat16() + if self.learnable: + return cos_cached, sin_cached + self.cos_cached, self.sin_cached = cos_cached, sin_cached + + return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] 
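+    # `yarn` below rebuilds `inv_freq` by blending interpolated and extrapolated frequencies with the
+    # ramp mask from `find_correction_range`, following the YaRN scaling approach.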
+ + def yarn(self, scale, device): + pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scale * pos_freqs) + + low, high = find_correction_range( + self.beta_fast, self.beta_slow, self.dim, self.base, self.max_embedding_length + ) + inv_freq_mask = ( + 1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device) + ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + self.register_buffer("inv_freq", inv_freq) # Get n-d magnitude scaling corrected for interpolation + + +# ## LongGLM Utils + + +@dataclass +class LongGLMMemCache: + """ + Class with LongLlama's memory cache + + Args: + key (`torch.FloatTensor` of shape `(batch_size, mem_length, head_nums, embed_size_per_head)`) + value (`torch.FloatTensor` of shape `(batch_size, mem_length, head_nums, embed_size_per_head)`) + masks (`torch.FloatTensor` of shape `(batch_size, 1, mem_length, 1)`) + For masking out parts of memory + """ + + key: torch.FloatTensor + value: torch.FloatTensor + masks: torch.FloatTensor + + +def mem_apply_update(prev_external_mem_cache: LongGLMMemCache, new_mem_content: LongGLMMemCache): + + def update_one(prev, new, dim=1): + if len(prev.shape) != len(new.shape): + raise ValueError(f"Memory cache content should be consistent in shape got {prev.shape} {new.shape}") + + return torch.concat([prev, new], dim=dim) + + insert_size = new_mem_content.key.shape[1] + + assert new_mem_content.key.shape[1] == new_mem_content.value.shape[1] + if new_mem_content.masks.shape[-2] != insert_size: + raise ValueError("Inconsistent mem_length in new_mem_content") + + return LongGLMMemCache( + key=update_one(prev_external_mem_cache.key, new_mem_content.key), + value=update_one(prev_external_mem_cache.value, new_mem_content.value), + masks=update_one(prev_external_mem_cache.masks, new_mem_content.masks, dim=-2), + ) + + +def generate_prompt_keypass(n_garbage: int, seed: int = None): + """Generates a text file and inserts an execute line at a random position.""" + if seed is not None: + rnd_state = random.get_state() + random.seed(seed) + n_garbage_prefix = random.randint(0, n_garbage) + n_garbage_suffix = n_garbage - n_garbage_prefix + + task_description = "在下文的大量无关紧要的文字中隐藏着一个非常重要的信息,请找到并记住它们,后面将使用到这个信息。" + garbage = "草是绿色的。天空是蓝色的。太阳是黄色的。我们走。我们离开又回来了。" + garbage_inf = "".join([garbage] * 5000) + assert len(garbage_inf) >= n_garbage + garbage_prefix = garbage_inf[:n_garbage_prefix] + garbage_suffix = garbage_inf[:n_garbage_suffix] + pass_key = random.randint(1, 50000) + information_line = ( + f"以下是本段文本的重要信息: “通行密码是'{pass_key}',这是非常重要的信息,请记住'{pass_key}'是通行密码。”" + ) + information_line = "\n".join([information_line] * 3) + final_question = "请问通行密码是多少?" + lines = [ + task_description, + garbage_prefix, + information_line, + garbage_suffix, + final_question, + ] + if seed is not None: + random.set_state(rnd_state) + return "\n".join(lines), str(pass_key) + + +# ## Loss Fuctions + + +def _unpack_router_logits(router_outputs): + """ + Unpack the router tuple for blance loss calculation. 
+ """ + total_router_logits = [] + total_expert_indexes = [] + for router_output in router_outputs: + if router_output[0] is not None: + router_logits, expert_indexes = router_output + total_router_logits.append(router_logits.unsqueeze(0)) + total_expert_indexes.append(expert_indexes.unsqueeze(0)) + # return torch.cat(total_router_logits, dim=0), torch.cat(total_expert_indexes, dim=0) + return torch.cat(total_router_logits, dim=0), total_expert_indexes + + +def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor, labels: torch.Tensor) -> float: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in PyTorch. + + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + router_probs (`torch.Tensor`): + Probability assigned to each expert per token. Shape: [num_layers, batch_size, sequence_length, num_experts]. + expert_indices (`torch.Tensor`): + Indices tensor of shape [num_layers, batch_size, sequence_length] identifying the selected expert for a given token. + + Returns: + The auxiliary loss. + """ + + num_layers, _, seq_len, num_experts = router_probs.shape + num_experts = router_probs.shape[-1] + new_labels = labels.clone().detach() + ## + for batch_tensor in new_labels: + neg_mask = batch_tensor == -100 + diff_neg_ones = torch.diff(neg_mask.float()) + start_pos = torch.where(diff_neg_ones == 1.0)[0] # positions where a run of -100 labels begins + if start_pos.nelement() == 0: # no run found; may need adjustment for the actual data + pass + else: + last_start = start_pos[-1] # start of the last run of -100 labels + batch_tensor[:last_start] = 0 # set the labels before it to 0 + new_labels = new_labels.to(torch.int64) + + # cast the expert indices to int64, otherwise one-hot encoding will fail + + if expert_indices.dtype != torch.int64: + expert_indices = expert_indices.to(torch.int64) + + if len(expert_indices.shape) == 3: + expert_indices = expert_indices.unsqueeze(3) + + expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts) + + # For a given token, determine if it was routed to a given expert.
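+    # (`expert_mask` is one-hot over experts per routed slot; the max over the top-k axis below
+    # collapses it to a per-token indicator of whether each expert was selected at all.)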
+ expert_mask = torch.max(expert_mask, axis=-2).values + + # cast to float32 otherwise mean will fail + expert_mask = expert_mask.to(torch.float32) + labels_mask = (new_labels[None, ..., None].expand_as(expert_mask) != -100).long() + + # sample level balance loss + tokens_per_group_and_expert = torch.sum(expert_mask * labels_mask, dim=-2) / torch.sum(labels_mask, dim=-2) + router_prob_per_group_and_expert = torch.sum(router_probs * labels_mask, dim=-2) / torch.sum(labels_mask, dim=-2) + tmp_per_group_and_expert = torch.mean(expert_mask) + return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2) + ''' + # batch level balance loss + expert_mask = expert_mask.view(num_layers, -1, num_experts).detach() + labels_mask = labels_mask.view(num_layers, -1, num_experts).detach() + origin_mask = labels_mask.clone() + router_probs = router_probs.view(num_layers, -1, num_experts) + + from antllm.utils import mpu + + torch.distributed.all_reduce(expert_mask, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(labels_mask, group=mpu.get_data_parallel_group()) + + labels_mask = labels_mask.bool().long() + + world_size = torch.distributed.get_world_size() + + tokens_per_group_and_expert = ( + torch.sum(expert_mask * labels_mask, dim=-2) / torch.sum(labels_mask, dim=-2) / world_size + ) + router_prob_per_group_and_expert = torch.sum(router_probs * origin_mask, dim=-2) / torch.sum(origin_mask, dim=-2) + layer_loss = tokens_per_group_and_expert * router_prob_per_group_and_expert + loss = layer_loss.sum(-1).mean() * num_experts + return loss + ''' + + +def group_level_device_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + router_probs (`torch.Tensor`): + Probability assigned to each expert per token. Shape: [num_layers, batch_size, seqeunce_length, num_experts]. + expert_indices (`torch.Tensor`): + Indices tensor of shape [num_layers, batch_size, seqeunce_length] identifying the selected expert for a given token. + + Returns: + The auxiliary loss. + """ + assert parallel_group is not None and parallel_group_size is not None + + num_layers, _, seq_len, num_experts = router_probs.shape + + # cast the expert indices to int64, otherwise one-hot encoding will fail + if expert_indices.dtype != torch.int64: + expert_indices = expert_indices.to(torch.int64) + + if len(expert_indices.shape) == 3: + expert_indices = expert_indices.unsqueeze(3) + + expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts) + + # For a given token, determine if it was routed to a given expert. 
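+    # Same top-k collapse as in `load_balancing_loss_func`; the per-expert indicators are then
+    # all-reduced across the expert-parallel group before computing the device-level balance terms.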
+ expert_mask = torch.max(expert_mask, axis=-2).values + + # cast to float32 otherwise mean will fail + expert_mask = expert_mask.to(torch.float32) + torch.distributed.all_reduce(expert_mask, group=parallel_group("expert")) + expert_parallel_size = parallel_group_size("expert") + num_experts_per_device = num_experts / expert_parallel_size + + # sample level balance loss + expert_mask = torch.sum( + torch.cat(torch.chunk(expert_mask.unsqueeze(-2), expert_parallel_size, dim=-1), dim=-2), dim=-1 + ) + tokens_per_group_and_device = torch.mean(expert_mask, axis=-2) / expert_parallel_size + + router_probs = torch.sum( + torch.cat(torch.chunk(router_probs.unsqueeze(-2), expert_parallel_size, dim=-1), dim=-2), dim=-1 + ) + router_prob_per_group_and_device = torch.mean(router_probs, axis=-2) + + device_loss = tokens_per_group_and_device * router_prob_per_group_and_device * expert_parallel_size + loss = device_loss.sum(-1).mean() + + return loss + + +def router_z_loss_func(router_logits: torch.Tensor, labels: torch.Tensor) -> float: + r""" + Compute the router z-loss implemented in PyTorch. + + The router z-loss was introduced in [Designing Effective Sparse Expert Models](https://arxiv.org/abs/2202.08906). + It encourages router logits to remain small in an effort to improve stability. + + Args: + router_logits (`float`): + Input logits of shape [num_layers, batch_size, sequence_length, num_experts] + + Returns: + Scalar router z-loss. + """ + num_layers, num_groups, tokens_per_group, _ = router_logits.shape + labels_mask = (labels[None, ..., None].expand_as(router_logits) != -100).long() + + ori_dtype = router_logits.dtype + if ori_dtype == torch.bfloat16: + loss_func_inputs = (router_logits * labels_mask).to(torch.float32) + else: + loss_func_inputs = router_logits * labels_mask + log_z = torch.logsumexp(loss_func_inputs, dim=-1).to(ori_dtype) + z_loss = log_z**2 + + # log_z = torch.logsumexp(router_logits * labels_mask, dim=-1) + # z_loss = log_z**2 + + return torch.sum(z_loss) / (num_layers * num_groups * tokens_per_group) + + +def auxiliary_loss(outputs, labels): + router_tuple = outputs.router_tuple + balance_loss, z_loss, last_logits_l2_loss = 0.0, 0.0, 0.0 + + loss = 0 + if router_tuple is not None: + router_logits, layer_router_index = _unpack_router_logits(router_tuple) + top1_expert_index = torch.cat(layer_router_index, dim=0) + outputs["layer_expert_index"] = top1_expert_index + z_loss = router_z_loss_func(router_logits, labels) + router_probs = torch.nn.Softmax(dim=-1)(router_logits) + balance_loss = load_balancing_loss_func(router_probs, top1_expert_index, labels) + + num_layers = router_probs.shape[0] + num_experts = router_probs.shape[-1] + router_probs_log = router_probs.detach().view(num_layers, -1, num_experts) + router_probs_mean = router_probs_log.mean(1) + router_probs_sort_mean = router_probs_log.sort(-1, descending=True)[0].mean(1) + router_probs_log = torch.stack([router_probs_mean, router_probs_sort_mean], dim=1) + dist.all_reduce(router_probs_log, dist.ReduceOp.SUM) + router_probs_log = router_probs_log / torch.distributed.get_world_size() + if dist.get_rank() == 0: + router_probs_log = router_probs_log.float() + router_probs_log /= router_probs_log.sum(-1, keepdim=True) + outputs["layer_expert_probs"] = router_probs_log.float().cpu() + + group_balance_loss = 0 + if float(outputs["router_group_balance_loss_alpha"]) > 0: + group_balance_loss = group_level_device_balancing_loss_func(router_probs, top1_expert_index) + loss = ( + float(outputs["router_z_loss_alpha"]) * z_loss 
+ + float(outputs["router_balance_loss_alpha"]) * balance_loss + + float(outputs["router_group_balance_loss_alpha"]) * group_balance_loss + ) + + last_logits_l2_loss = 0.0 + if float(outputs["last_logits_l2_alpha"]) >= 0: + logits = outputs.logits.view(-1, outputs.logits.size(-1)) + labels_mask = (labels.view(-1) != -100).long() + + last_logits_l2_loss = torch.sum(torch.linalg.norm(logits.float(), 2.0, dim=-1) * labels_mask) / torch.sum( + labels_mask + ) + loss += float(outputs["last_logits_l2_alpha"]) * last_logits_l2_loss + last_logits_l2_loss = last_logits_l2_loss.item() + + return loss, balance_loss, z_loss, last_logits_l2_loss + + +def expert_balanced_auxiliary_cross_entropy(outputs, labels, *args, **kwargs): + """FOR PRETRAIN ONLY""" + # Output losses without reduction for compute dataset loss + if kwargs.get("output_losses", False): + lm_loss, losses = cross_entropy_loss(outputs.logits, labels, *args, **kwargs) + else: + lm_loss = cross_entropy_loss(outputs.logits, labels, *args, **kwargs) + aux_loss, balance_loss, z_loss, last_logits_l2_loss = auxiliary_loss(outputs, labels) + loss = lm_loss + aux_loss + if kwargs.get("output_losses", False): + return loss, lm_loss, balance_loss, z_loss, last_logits_l2_loss, losses + return loss, lm_loss, balance_loss, z_loss, last_logits_l2_loss + + +def expert_balanced_auxiliary_cross_entropy_for_sft(outputs, labels, *args, **kwargs): + """FOR SFT ONLY""" + lm_loss = sample_level_cross_entropy(outputs, labels, **kwargs) + aux_loss, balance_loss, z_loss, last_logits_l2_loss = auxiliary_loss(outputs, labels) + loss = lm_loss + aux_loss + return loss + + +def expert_balanced_auxiliary_global_level_cross_entropy(outputs, labels, *args, **kwargs): + """FOR SFT ONLY""" + lm_loss = global_token_level_cross_entropy(outputs, labels, **kwargs) + aux_loss, balance_loss, z_loss, last_logits_l2_loss = auxiliary_loss(outputs, labels) + loss = lm_loss + aux_loss + + return [ + loss, + { + 'aux_loss': aux_loss, + 'balance_loss': balance_loss, + 'z_loss': z_loss, + 'last_logits_l2_loss': last_logits_l2_loss, + }, + ] + + +def cross_entropy_loss(logits, labels, loss_mask, *args, **kwargs): + if kwargs["use_atorch_cross_entropy"]: + from atorch.modules.transformer import losses as atorch_loss + + losses = atorch_loss.CrossEntropyLoss(reduction="none")(logits.view(-1, logits.size(-1)), labels.view(-1)) + else: + losses = torch.nn.CrossEntropyLoss(reduction="none")(logits.view(-1, logits.size(-1)), labels.view(-1)) + + loss = torch.sum(losses * loss_mask.view(-1)) + if loss_mask.sum().item() > 0: + loss = loss / loss_mask.sum() + if kwargs.get("output_losses", False): + return loss, losses + return loss + + +def local_token_level_cross_entropy(outputs, labels, **kwargs): + # return outputs.loss / torch.distributed.get_world_size() + # 在每个batch内部做token-level的平均,然后在所有batch间做平均 + # return outputs.loss + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(outputs.logits.contiguous().view(-1, outputs.logits.size(-1)), labels.contiguous().view(-1)) + + return loss + + +def mini_batch_token_level_cross_entropy(outputs, labels, mini_batch=1, **kwargs): + # 这个loss会先把batch分成小的mini_batch,在mini_batch内做个token-level的平均,然后做所有卡之间的平均 + loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none') + if labels.shape[0] % mini_batch != 0: + # 如果batch % mini_batch != 0, 则不切分计算. 
+ loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(outputs.logits.contiguous().view(-1, outputs.logits.size(-1)), labels.contiguous().view(-1)) + else: + loss = loss_fct( + outputs.logits.contiguous().view(-1, outputs.logits.size(-1)), labels.contiguous().view(-1) + ).reshape(labels.shape[0] // mini_batch, -1) + + labels = labels.reshape(labels.shape[0] // mini_batch, -1) + loss = loss.sum(-1) / (labels != -100).sum(-1) + loss = loss.mean() + return loss + + +def sample_level_cross_entropy(outputs, labels, **kwargs): + # Take the token-level average within each sample first, then average over all samples. + loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none') + loss = loss_fct( + outputs.logits.contiguous().view(-1, outputs.logits.size(-1)), labels.contiguous().view(-1) + ).reshape(labels.shape[0], -1) + loss = loss.sum(-1) / (labels != -100).sum(-1) + loss = loss.mean() + return loss + + +def global_token_level_cross_entropy(outputs, labels, **kwargs): + # Token-level average over all samples jointly, across all devices. + loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none') + loss = loss_fct( + outputs.logits.contiguous().view(-1, outputs.logits.size(-1)), labels.contiguous().view(-1) + ).reshape(labels.shape[0], -1) + num_tokens = (loss != 0).sum() + loss = loss.sum() + + num_tokens_tensor = torch.zeros([1], device=loss.device, dtype=loss.dtype) + num_tokens_tensor[0] = num_tokens.item() + + torch.distributed.all_reduce(num_tokens_tensor) + + global_num_tokens = num_tokens_tensor.sum() + + torch.distributed.barrier() + # global_num_tokens is the global token count; because gradients are automatically averaged (mean) over all devices during the update, + # we multiply by world_size here. + loss = loss.sum() / global_num_tokens * torch.distributed.get_world_size() + + return loss + + +LOSS_MAP = { + 'local_token_level_cross_entropy': local_token_level_cross_entropy, + 'mini_batch_token_level_cross_entropy': mini_batch_token_level_cross_entropy, + 'sample_level_cross_entropy': sample_level_cross_entropy, + 'global_token_level_cross_entropy': global_token_level_cross_entropy, + "moe_auxiliary": expert_balanced_auxiliary_cross_entropy, + "moe_auxiliary_sft": expert_balanced_auxiliary_cross_entropy_for_sft, + "pretrain_default": cross_entropy_loss, + "moe_auxiliary_global_token_level": expert_balanced_auxiliary_global_level_cross_entropy, +} + +class Transpose(nn.Module): + def __init__(self, dim0: int, dim1: int): + super().__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + return x.transpose(self.dim0, self.dim1) + +def patch_continuous_features( + input_embeddings: torch.Tensor, + placeholder_loc_lens: torch.Tensor, + encoded_feats: torch.Tensor, + encoded_feat_lens: torch.Tensor, +): + """ + Patch continuous features into input embeddings, while keeping a valid gradient flow. + + input_embeddings: torch.Tensor, size = [B, C?, T, D] + placeholder_loc_lens: torch.LongTensor, size = [B, N, 2] + Each 2-tuple represents (start, length) of a placeholder. + encoded_feats: torch.Tensor, size = [B, L1 + L2 + ... + LN, ...]
+ encoded_feat_lens: torch.LongTensor, size = [B, N] + + Example ('X' for patch placeholder tokens): + Inputs: + input_embeddings = [[1, 2, 3, X, X, X, 4, 5, 6, X, X, X, 7, 8]] + placeholder_loc_lens = [[3, 3], [9, 3]] + encoded_feats = [[A, A, A, B, B]] + encoded_feat_lens = [[3, 2]] + Outputs: + embeddings = [[1, 2, 3, A, A, A, 4, 5, 6, B, B, X, 7, 8]] + """ + batch_size = input_embeddings.size(0) + audio_feats_mask = torch.zeros_like(input_embeddings, dtype=torch.bool) + audio_feats_buffer = [] + for i in range(batch_size): + sample_len = 0 + audio_feat_start = 0 + audio_feat_buffer = [] + for j in range(placeholder_loc_lens.shape[1]): + placeholder_start: int = int(placeholder_loc_lens[i, j, 0].item()) + placeholder_len: int = int(placeholder_loc_lens[i, j, 1].item()) + if placeholder_len <= 0: + break + feat_len = int(encoded_feat_lens[i, j].item()) + real_feat_len = feat_len + if feat_len > placeholder_len: + # logger.warning( + # f"Feature length ({feat_len}) > placeholder length ({placeholder_len}). This is not expected. Please " + # "check the implementation of estimate_audio_feature_length(). We truncate the feature to avoid errors." + # ) + feat_len = placeholder_len + if placeholder_start > sample_len: + audio_feat_buffer.append(input_embeddings.new_zeros((placeholder_start - sample_len, input_embeddings.shape[2]))) + sample_len = placeholder_start + audio_feat_buffer.append(encoded_feats[i, audio_feat_start:audio_feat_start + feat_len]) + if feat_len < placeholder_len: + audio_feat_buffer.append(encoded_feats.new_zeros(placeholder_len - feat_len)) + audio_feats_mask[i, sample_len:sample_len + feat_len] = 1 + audio_feat_start += real_feat_len + sample_len += placeholder_len + if sample_len < input_embeddings.shape[1]: + audio_feat_buffer.append( + input_embeddings.new_zeros((input_embeddings.shape[1] - sample_len, input_embeddings.shape[2])) + ) + audio_feats_buffer.append(torch.cat(audio_feat_buffer)) + audio_feats_buffer = torch.stack(audio_feats_buffer, dim=0) + embeddings = audio_feats_buffer * audio_feats_mask + input_embeddings * ~audio_feats_mask + return embeddings + +def unwrap_feats(feats: torch.Tensor, feats_lengths: torch.Tensor): + """ + The input feats are in the "wrapped" format, which means that features from (at most) N audios are concatenated + as a single sample feats[i]. In this case, each row of feats_lengths contains the lengths of the concatenated + feature. This function unwraps the features. + For samples with less than N segments, one should pad feats_lengths with 0. The result will contain valid + segments only. + + feats: torch.Tensor, size = [B, L1 + L2 + ... + LN, ...] 
+ feats_lengths: torch.LongTensor, size = [B, N] + + Example ('X' for padding): + Inputs: + feats = [[A, A, A, A, X], + [B, B, C, C, C]] + feats_lengths = [[4, 0], + [2, 3]] + Outputs: + feat_segs = [[A, A, A, A], + [B, B, X, X], + [C, C, C, X]] + feat_seg_lengths = [4, 2, 3] + """ + feat_segs = [] + feat_seg_lengths = [] + for i in range(feats_lengths.shape[0]): + feat_index = 0 + for j in range(feats_lengths.shape[1]): + feat_len = feats_lengths[i, j].item() + if feat_len == 0: break + feat_segs.append(feats[i, feat_index:feat_index + feat_len]) + feat_seg_lengths.append(feat_len) + feat_index += feat_len + feat_segs_batch = torch.nn.utils.rnn.pad_sequence(feat_segs, True).to(feats.device) + feat_seg_lengths = torch.tensor(feat_seg_lengths, dtype=torch.long, device=feats.device) + return feat_segs_batch, feat_seg_lengths + +def wrap_feats(feat_segs: torch.Tensor, feats_lengths: torch.Tensor, feats_seg_lengths: Optional[torch.Tensor] = None): + """ + Wrap segmented features back to the wrapped format. + This function is the inverse operation of unwrap_feats(). See its documentation for details. + Note that the feats_lengths value does not matter a lot. We only check the location of the first 0 to determine the + number of feature segments. + """ + feat_idx = 0 + feats_buffer = [] + feats_locs_buffer = [] + feats_lengths_buffer = [] + for i in range(feats_lengths.shape[0]): + feat_buffer = [] + feat_locs_buffer = [] + feat_lengths_buffer = [] + feat_total_len = 0 + for j in range(feats_lengths.shape[1]): + feat_len = feats_lengths[i, j].item() + if feat_len == 0: + break + if feats_seg_lengths is not None: + feat_len = feats_seg_lengths[feat_idx].item() + feat_buffer.append(feat_segs[feat_idx, :feat_len]) + feat_locs_buffer.append(feat_total_len) + feat_lengths_buffer.append(feat_len) + feat_idx += 1 + feat_total_len += feat_len + feats_buffer.append(torch.cat(feat_buffer)) + feats_locs_buffer.append(torch.tensor(feat_locs_buffer, dtype=torch.long)) + feats_lengths_buffer.append(torch.tensor(feat_lengths_buffer, dtype=torch.long)) + feats = torch.nn.utils.rnn.pad_sequence(feats_buffer, True).to(feat_segs.device) + feats_locs = torch.nn.utils.rnn.pad_sequence(feats_locs_buffer, True).to(feats_lengths.device) + feats_new_lengths = torch.nn.utils.rnn.pad_sequence(feats_lengths_buffer, True).to(feats_lengths.device) + return feats, feats_locs, feats_new_lengths + +def encode_audio_segments( + encoder, + proj_layer, + wav_feats=None, + wav_feats_lengths=None, + waveforms=None, + waveforms_lengths=None, + use_waveform=False, +): + """ + Apply audio encoder to input audio features in wrapped format. + See the documentation of unwrap_feats() for details about 'wrapped format'. + """ + + # Forward audio encoder. + if use_waveform: + assert waveforms is not None and waveforms_lengths is not None + # Unwrap the waveforms so each waveform is placed at an independent row. + waveform_segs_batch, waveform_seg_lengths = unwrap_feats(waveforms, waveforms_lengths) + audio_feats_seg, audio_feat_seg_lengths = encoder(waveform_segs_batch, waveform_seg_lengths)[:2] + else: + assert wav_feats is not None and wav_feats_lengths is not None + # Unwrap the features so the feature of each waveform is placed at an independent row. 
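# --- Editor's note: illustrative sketch, not part of this diff. ----------------
# Toy round trip through the unwrap_feats()/wrap_feats() helpers defined above
# (assumed to be in scope), mirroring their docstring examples; feature values
# are hypothetical.
import torch

feats = torch.arange(10, dtype=torch.float32).view(2, 5, 1)    # [B=2, T=5, D=1], wrapped format
feats_lengths = torch.tensor([[4, 0], [2, 3]])                 # per-row segment lengths (0 = no segment)
segs, seg_lens = unwrap_feats(feats, feats_lengths)
print(segs.shape, seg_lens.tolist())                           # torch.Size([3, 4, 1]) [4, 2, 3]
rewrapped, locs, lens = wrap_feats(segs, feats_lengths, seg_lens)
print(locs.tolist(), lens.tolist())                            # [[0, 0], [0, 2]] [[4, 0], [2, 3]]
# --------------------------------------------------------------------------------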
+ feat_segs_batch, feat_seg_lengths = unwrap_feats(wav_feats, wav_feats_lengths) + audio_feats_seg, audio_feat_seg_lengths = encoder(feat_segs_batch, feat_seg_lengths)[:2] + audio_feats_seg_proj = proj_layer(audio_feats_seg.transpose(-1, -2)).transpose(-1, -2) + + # Wrap the features so the 1st dim represents batch_size. + input_lengths = waveforms_lengths if use_waveform else wav_feats_lengths + assert input_lengths is not None + audio_feats, _, audio_feats_lengths = wrap_feats(audio_feats_seg, input_lengths, audio_feat_seg_lengths) + audio_feats_proj, _, audio_feats_lengths2 = wrap_feats(audio_feats_seg_proj, input_lengths, audio_feat_seg_lengths) + assert torch.all(audio_feats_lengths == audio_feats_lengths2), f"{audio_feats_lengths}, {audio_feats_lengths2}" + + return audio_feats_proj, audio_feats, audio_feats_lengths + +def patch_continuous_features( + input_embeddings: torch.Tensor, + placeholder_loc_lens: torch.Tensor, + encoded_feats: torch.Tensor, + encoded_feat_lens: torch.Tensor, +): + """ + Patch continuous features into input embeddings, while keeping a valid gradient flow. + + input_embeddings: torch.Tensor, size = [B, C?, T, D] + placeholder_loc_lens: torch.LongTensor, size = [B, N, 2] + Each 2-tuple represents (start, length) of a placeholder. + encoded_feats: torch.Tensor, size = [B, L1 + L2 + ... + LN, ...] + encoded_feat_lens: torch.LongTensor, size = [B, N] + + Example ('X' for patch placeholder tokens): + Inputs: + input_embeddings = [[1, 2, 3, X, X, X, 4, 5, 6, X, X, X, 7, 8]] + placeholder_loc_lens = [[3, 3], [9, 3]] + encoded_feats = [[A, A, A, B, B]] + encoded_feat_lens = [[3, 2]] + Outputs: + embeddings = [[1, 2, 3, A, A, A, 4, 5, 6, B, B, X, 7, 8]] + """ + batch_size = input_embeddings.size(0) + audio_feats_mask = torch.zeros_like(input_embeddings, dtype=torch.bool) + audio_feats_buffer = [] + for i in range(batch_size): + sample_len = 0 + audio_feat_start = 0 + audio_feat_buffer = [] + for j in range(placeholder_loc_lens.shape[1]): + placeholder_start: int = int(placeholder_loc_lens[i, j, 0].item()) + placeholder_len: int = int(placeholder_loc_lens[i, j, 1].item()) + if placeholder_len <= 0: + break + feat_len = int(encoded_feat_lens[i, j].item()) + real_feat_len = feat_len + if feat_len > placeholder_len: + logging.warning( + f"Feature length ({feat_len}) > placeholder length ({placeholder_len}). This is not expected. Please " + "check the implementation of estimate_audio_feature_length(). We truncate the feature to avoid errors." 
+ ) + feat_len = placeholder_len + if placeholder_start > sample_len: + audio_feat_buffer.append(input_embeddings.new_zeros((placeholder_start - sample_len, input_embeddings.shape[2]))) + sample_len = placeholder_start + audio_feat_buffer.append(encoded_feats[i, audio_feat_start:audio_feat_start + feat_len]) + if feat_len < placeholder_len: + audio_feat_buffer.append(encoded_feats.new_zeros(placeholder_len - feat_len)) + audio_feats_mask[i, sample_len:sample_len + feat_len] = 1 + audio_feat_start += real_feat_len + sample_len += placeholder_len + if sample_len < input_embeddings.shape[1]: + audio_feat_buffer.append( + input_embeddings.new_zeros((input_embeddings.shape[1] - sample_len, input_embeddings.shape[2])) + ) + audio_feats_buffer.append(torch.cat(audio_feat_buffer)) + audio_feats_buffer = torch.stack(audio_feats_buffer, dim=0) + embeddings = audio_feats_buffer * audio_feats_mask + input_embeddings * ~audio_feats_mask + return embeddings + diff --git a/out.wav b/out.wav new file mode 100644 index 0000000000000000000000000000000000000000..12aaf025837844e0547de7ff36f8c664ef954c17 --- /dev/null +++ b/out.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:979abbf02cae463f74b3c0e2f1a50fc1d3ad1882cd54f291956f52dc11006947 +size 1091664 diff --git a/preprocessor_config.json b/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..709ada1bd891093da129a1f9ae85a18903e626bc --- /dev/null +++ b/preprocessor_config.json @@ -0,0 +1,54 @@ +{ + "auto_map": { + "AutoImageProcessor": "image_processing_bailingmm.BailingMMImageProcessor", + "AutoFeatureExtractor": "audio_processing_bailingmm.BailingMMAudioProcessor", + "AutoProcessor": "processing_bailingmm.BailingMMProcessor" + }, + "min_pixels": 78400, + "max_pixels": 802816, + "patch_size": 14, + "temporal_patch_size": 2, + "merge_size": 2, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "image_token": "", + "video_token": "