Spaces:

ASLP-lab
/

DiffRhythm

Running on Zero

App Files Community

hkchen committed about 15 hours ago

Commit

34ded66

1 Parent(s): 7d15a50

update v1.2 full web code

Browse files

Files changed (5) hide show

app.py +54 -43
diffrhythm/infer/infer.py +4 -1
diffrhythm/infer/infer_utils.py +20 -9
diffrhythm/model/cfm.py +4 -2
diffrhythm/model/dit.py +4 -1

app.py CHANGED Viewed

@@ -27,19 +27,22 @@ from diffrhythm.infer.infer import inference
 MAX_SEED = np.iinfo(np.int32).max
 device='cuda'
-cfm, tokenizer, muq, vae, eval_model, eval_muq = prepare_model(device)
 cfm = torch.compile(cfm)
 @spaces.GPU
-def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, cfg_strength=4.0, file_type='wav', odeint_method='euler', preference_infer="quality first", edit=False, edit_segments=None, device='cuda'):
-    max_frames = 2048
     sway_sampling_coef = -1 if steps < 32 else None
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     torch.manual_seed(seed)
     vocal_flag = False
     try:
-        lrc_prompt, start_time = get_lrc_token(max_frames, lrc, tokenizer, device)
         if current_prompt_type == 'audio':
             style_prompt, vocal_flag = get_audio_style_prompt(muq, ref_audio_path)
         else:
@@ -59,7 +62,7 @@ def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42,
                                eval_muq=eval_muq,
                                cond=latent_prompt,
                                text=lrc_prompt,
-                               duration=max_frames,
                                style_prompt=style_prompt,
                                negative_style_prompt=negative_style_prompt,
                                steps=steps,
@@ -71,6 +74,7 @@ def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42,
                                odeint_method=odeint_method,
                                pred_frames=pred_frames,
                                batch_infer_num=batch_infer_num,
                                )
     return generated_song
@@ -194,7 +198,7 @@ with gr.Blocks(css=css) as demo:
                         lines=12,
                         max_lines=50,
                         elem_classes="lyrics-scroll-box",
-                        value="""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it\n"""
                     )
                     current_prompt_type = gr.State(value="audio")
@@ -217,35 +221,39 @@ with gr.Blocks(css=css) as demo:
                 with gr.Column():
                     with gr.Accordion("Best Practices Guide", open=True):
                         gr.Markdown("""
-1. **Lyrics Format Requirements**
-    - Each line must follow: `[mm:ss.xx]Lyric content`
-    - Example of valid format:
-    ```
-    [00:10.00]Moonlight spills through broken blinds
-    [00:13.20]Your shadow dances on the dashboard shrine
-    ```
-2. **Audio Prompt Requirements**
-    - Reference audio should be ≥ 1 second, audio >10 seconds will be randomly clipped into 10 seconds.
-    - For optimal results, the 10-second clips should be carefully selected.
-    - Shorter clips may lead to incoherent generation.
-3. **Supported Languages**
-    - **Chinese and English**
-    - More languages comming soon.
-4. **Editing Function in Advanced Settings**
-    - Using full-length audio as reference is recommended for best results.
-    - Use -1 to represent the start/end of audio (e.g. [[-1,25], [50,-1]] means "from start to 25s" and "from 50s to end").
-5. **Generate Preference**
-    - Quality First: Higher quality , slightly slower.
-    - Speed First: Faster generation with slightly reduced quality.
-6. **Others**
-    - If loading audio result is slow, you can select Output Format as mp3 in Advanced Settings.
                         """)
                     # Music_Duration = gr.Radio(["95s", "285s"], label="Music Duration", value="95s")
                     preference_infer = gr.Radio(["quality first", "speed first"], label="Preference", value="quality first")
                     lyrics_btn = gr.Button("Generate", variant="primary")
                     audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
@@ -277,11 +285,13 @@ with gr.Blocks(css=css) as demo:
                                     interactive=True,
                                     elem_id="step_slider"
                                 )
-                        edit = gr.Checkbox(label="edit", value=False)
-                        edit_segments = gr.Textbox(
-                            label="Edit Segments",
-                            placeholder="Time segments to edit (in seconds). Format: `[[start1,end1],...]",
-                            )
                         odeint_method = gr.Radio(["euler", "midpoint", "rk4","implicit_adams"], label="ODE Solver", value="euler")
                         file_type = gr.Dropdown(["wav", "mp3", "ogg"], label="Output Format", value="mp3")
@@ -324,14 +334,15 @@ with gr.Blocks(css=css) as demo:
             gr.Examples(
                 examples=[
-                    ["""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it\n"""],
-                    ["""[00:00.52]Abracadabra abracadabra\n[00:03.97]Ha\n[00:04.66]Abracadabra abracadabra\n[00:12.02]Yeah\n[00:15.80]Pay the toll to the angels\n[00:19.08]Drawin' circles in the clouds\n[00:23.31]Keep your mind on the distance\n[00:26.67]When the devil turns around\n[00:30.95]Hold me in your heart tonight\n[00:34.11]In the magic of the dark moonlight\n[00:38.44]Save me from this empty fight\n[00:43.83]In the game of life\n[00:45.84]Like a poem said by a lady in red\n[00:49.45]You hear the last few words of your life\n[00:53.15]With a haunting dance now you're both in a trance\n[00:56.90]It's time to cast your spell on the night\n[01:01.40]Abracadabra ama-ooh-na-na\n[01:04.88]Abracadabra porta-ooh-ga-ga\n[01:08.92]Abracadabra abra-ooh-na-na\n[01:12.30]In her tongue she's sayin'\n[01:14.76]Death or love tonight\n[01:18.61]Abracadabra abracadabra\n[01:22.18]Abracadabra abracadabra\n[01:26.08]Feel the beat under your feet\n[01:27.82]The floor's on fire\n[01:29.90]Abracadabra abracadabra\n"""],
-                    ["""[00:00.27]只因你太美 baby 只因你太美 baby\n[00:08.95]只因你实在是太美 baby\n[00:13.99]只因你太美 baby\n[00:18.89]迎面走来的你让我如此蠢蠢欲动\n[00:20.88]这种感觉我从未有\n[00:21.79]Cause I got a crush on you who you\n[00:25.74]你是我的我是你的谁\n[00:28.09]再多一眼看一眼就会爆炸\n[00:30.31]再近一点靠近点快被融化\n[00:32.49]想要把你占为己有 baby bae\n[00:34.60]不管走到哪里\n[00:35.44]都会想起的人是你 you you\n[00:38.12]我应该拿你怎样\n[00:39.61]Uh 所有人都在看着你\n[00:42.36]我的心总是不安\n[00:44.18]Oh 我现在已病入膏肓\n[00:46.63]Eh oh\n[00:47.84]难道真的因你而疯狂吗\n[00:51.57]我本来不是这种人\n[00:53.59]因你变成奇怪的人\n[00:55.77]第一次呀变成这样的我\n[01:01.23]不管我怎么去否认\n[01:03.21]只因你太美 baby 只因你太美 baby\n[01:11.46]只因你实在是太美 baby\n[01:16.75]只因你太美 baby\n[01:21.09]Oh eh oh\n[01:22.82]现在确认地告诉我\n[01:25.26]Oh eh oh\n[01:27.31]你到底属于谁\n[01:29.98]Oh eh oh\n[01:31.70]现在确认地告诉我\n[01:34.45]Oh eh oh\n"""]
                 ],
                 inputs=[lrc],
                 label="Lrc Examples",
-                examples_per_page=3,
                 elem_id="lrc-examples-container",
             )
@@ -426,7 +437,7 @@ with gr.Blocks(css=css) as demo:
     lyrics_btn.click(
         fn=infer_music,
-        inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, cfg_strength, file_type, odeint_method, preference_infer, edit, edit_segments],
         outputs=audio_output
     )

 MAX_SEED = np.iinfo(np.int32).max
 device='cuda'
+cfm, tokenizer, muq, vae, eval_model, eval_muq = prepare_model(max_frames=6144, device=device)
 cfm = torch.compile(cfm)
 @spaces.GPU
+def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42,
+                randomize_seed=False, steps=32, cfg_strength=4.0, file_type='wav',
+                odeint_method='euler', preference_infer="quality first", Music_Duration=285, edit=False,
+                edit_segments=None, device='cuda'):
+    max_frames = 2048 if Music_Duration == 95 else 6144
     sway_sampling_coef = -1 if steps < 32 else None
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     torch.manual_seed(seed)
     vocal_flag = False
     try:
+        lrc_prompt, start_time, end_frame, song_duration = get_lrc_token(max_frames, lrc, tokenizer, Music_Duration, device)
         if current_prompt_type == 'audio':
             style_prompt, vocal_flag = get_audio_style_prompt(muq, ref_audio_path)
         else:
                                eval_muq=eval_muq,
                                cond=latent_prompt,
                                text=lrc_prompt,
+                               duration=end_frame,
                                style_prompt=style_prompt,
                                negative_style_prompt=negative_style_prompt,
                                steps=steps,
                                odeint_method=odeint_method,
                                pred_frames=pred_frames,
                                batch_infer_num=batch_infer_num,
+                               song_duration=song_duration
                                )
     return generated_song
                         lines=12,
                         max_lines=50,
                         elem_classes="lyrics-scroll-box",
+                        value="""[00:04.074] Tell me that I'm special\n[00:06.226] Tell me I look pretty\n[00:08.175] Tell me I'm a little angel\n[00:10.175] Sweetheart of your city\n[00:13.307] Say what I'm dying to hear\n[00:17.058] 'Cause I'm dying to hear you\n[00:20.523] Tell me I'm that new thing\n[00:22.571] Tell me that I'm relevant\n[00:24.723] Tell me that I got a big heart\n[00:26.723] Then back it up with evidence\n[00:29.408] I need it and I don't know why\n[00:33.907] This late at night\n[00:36.139] Isn't it lonely\n[00:38.991] I'd do anything to make you want me\n[00:43.222] I'd give it all up if you told me\n[00:47.339] That I'd be\n[00:49.157] The number one girl in your eyes\n[00:52.506] Your one and only\n[00:55.437] So what's it gon' take for you to want me\n[00:59.589] I'd give it all up if you told me\n[01:03.674] That I'd be\n[01:05.823] The number one girl in your eyes\n[01:10.841] Tell me I'm going real big places\n[01:14.055] Down to earth, so friendly\n[01:16.105] And even through all the phases\n[01:18.256] Tell me you accept me\n[01:21.155] Well, that's all I'm dying to hear\n[01:24.937] Yeah, I'm dying to hear you\n[01:28.521] Tell me that you need me\n[01:30.437] Tell me that I'm loved\n[01:32.740] Tell me that I'm worth it\n[01:34.605] And that I'm enough\n[01:37.571] I need it and I don't know why\n[01:41.889] This late at night\n[01:43.805] Isn't it lonely\n[01:46.871] I'd do anything to make you want me\n[01:51.004] I'd give it all up if you told me\n[01:55.237] That I'd be\n[01:57.089] The number one girl in your eyes\n[02:00.325] Your one and only\n[02:03.305] So what's it gon' take for you to want me\n[02:07.355] I'd give it all up if you told me\n[02:11.589] That I'd be\n[02:13.623] The number one girl in your eyes\n[02:16.804] The girl in your eyes\n[02:20.823] The girl in your eyes\n[02:26.055] Tell me I'm the number one girl\n[02:28.355] I'm the number one girl in your eyes\n[02:33.172] The girl in your eyes\n[02:37.321] 
The girl in your eyes\n[02:42.472] Tell me I'm the number one girl\n[02:44.539] I'm the number one girl in your eyes\n[02:49.605] Well isn't it lonely\n[02:52.488] I'd do anything to make you want me\n[02:56.637] I'd give it all up if you told me\n[03:00.888] That I'd be\n[03:03.172] The number one girl in your eyes\n[03:06.272] Your one and only\n[03:09.160] So what's it gon' take for you to want me\n[03:13.056] I'd give it all up if you told me\n[03:17.305] That I'd be\n[03:19.488] The number one girl in your eyes\n[03:25.420] The number one girl in your eyes\n"""
                     )
                     current_prompt_type = gr.State(value="audio")
                 with gr.Column():
                     with gr.Accordion("Best Practices Guide", open=True):
                         gr.Markdown("""
+                        1. **Lyrics Format Requirements**
+                        - Each line must follow: `[mm:ss.xx]Lyric content`
+                        - Example of valid format:
+                            ```
+                            [00:10.00]Moonlight spills through broken blinds
+                            [00:13.20]Your shadow dances on the dashboard shrine
+                            ```
+                        2. **Generation Duration Limits**
+                        - The generated music must be between **95 seconds (1:35)** and **285 seconds (4:45)** in length
+                        - The latest valid lyric timestamp cannot exceed **04:45.00 (285s)**
+                        3. **Audio Prompt Requirements**
+                        - Reference audio should be ≥ 1 second, Audio >10 seconds will be randomly clipped into 10 seconds
+                        - For optimal results, the 10-second clips should be carefully selected
+                        - Shorter clips may lead to incoherent generation
+                        4. **Supported Languages**
+                        - Chinese and English
+                        - More languages comming soon
+                        5. **Others**
+                        - If loading audio result is slow, you can select Output Format as mp3 in Advanced Settings.
                         """)
                     # Music_Duration = gr.Radio(["95s", "285s"], label="Music Duration", value="95s")
+                    Music_Duration = gr.Slider(
+                        minimum=95,
+                        maximum=285,
+                        step=1,
+                        value=95,
+                        label="Music Duration (s)",
+                        interactive=True
+                    )
                     preference_infer = gr.Radio(["quality first", "speed first"], label="Preference", value="quality first")
                     lyrics_btn = gr.Button("Generate", variant="primary")
                     audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
                                     interactive=True,
                                     elem_id="step_slider"
                                 )
+                        # edit = gr.Checkbox(label="edit", value=False)
+                        # edit = False
+                        # preference_infer = gr.Radio(["quality first", "speed first"], label="Preference", value="quality first")
+                        # edit_segments = gr.Textbox(
+                        #     label="Edit Segments",
+                        #     placeholder="Time segments to edit (in seconds). Format: `[[start1,end1],...]",
+                        #     )
                         odeint_method = gr.Radio(["euler", "midpoint", "rk4","implicit_adams"], label="ODE Solver", value="euler")
                         file_type = gr.Dropdown(["wav", "mp3", "ogg"], label="Output Format", value="mp3")
             gr.Examples(
                 examples=[
+                    ["""[00:04.074] Tell me that I'm special\n[00:06.226] Tell me I look pretty\n[00:08.175] Tell me I'm a little angel\n[00:10.175] Sweetheart of your city\n[00:13.307] Say what I'm dying to hear\n[00:17.058] 'Cause I'm dying to hear you\n[00:20.523] Tell me I'm that new thing\n[00:22.571] Tell me that I'm relevant\n[00:24.723] Tell me that I got a big heart\n[00:26.723] Then back it up with evidence\n[00:29.408] I need it and I don't know why\n[00:33.907] This late at night\n[00:36.139] Isn't it lonely\n[00:38.991] I'd do anything to make you want me\n[00:43.222] I'd give it all up if you told me\n[00:47.339] That I'd be\n[00:49.157] The number one girl in your eyes\n[00:52.506] Your one and only\n[00:55.437] So what's it gon' take for you to want me\n[00:59.589] I'd give it all up if you told me\n[01:03.674] That I'd be\n[01:05.823] The number one girl in your eyes\n[01:10.841] Tell me I'm going real big places\n[01:14.055] Down to earth, so friendly\n[01:16.105] And even through all the phases\n[01:18.256] Tell me you accept me\n[01:21.155] Well, that's all I'm dying to hear\n[01:24.937] Yeah, I'm dying to hear you\n[01:28.521] Tell me that you need me\n[01:30.437] Tell me that I'm loved\n[01:32.740] Tell me that I'm worth it\n[01:34.605] And that I'm enough\n[01:37.571] I need it and I don't know why\n[01:41.889] This late at night\n[01:43.805] Isn't it lonely\n[01:46.871] I'd do anything to make you want me\n[01:51.004] I'd give it all up if you told me\n[01:55.237] That I'd be\n[01:57.089] The number one girl in your eyes\n[02:00.325] Your one and only\n[02:03.305] So what's it gon' take for you to want me\n[02:07.355] I'd give it all up if you told me\n[02:11.589] That I'd be\n[02:13.623] The number one girl in your eyes\n[02:16.804] The girl in your eyes\n[02:20.823] The girl in your eyes\n[02:26.055] Tell me I'm the number one girl\n[02:28.355] I'm the number one girl in your eyes\n[02:33.172] The girl in your eyes\n[02:37.321] The girl 
in your eyes\n[02:42.472] Tell me I'm the number one girl\n[02:44.539] I'm the number one girl in your eyes\n[02:49.605] Well isn't it lonely\n[02:52.488] I'd do anything to make you want me\n[02:56.637] I'd give it all up if you told me\n[03:00.888] That I'd be\n[03:03.172] The number one girl in your eyes\n[03:06.272] Your one and only\n[03:09.160] So what's it gon' take for you to want me\n[03:13.056] I'd give it all up if you told me\n[03:17.305] That I'd be\n[03:19.488] The number one girl in your eyes\n[03:25.420] The number one girl in your eyes\n"""],
+                    ["""[00:00.133]Abracadabra, abracadabra\n[00:03.985]Abracadabra, abracadabra\n[00:15.358]Pay the toll to the angels\n[00:18.694]Drawin' circles in the clouds\n[00:22.966]Keep your mind on the distance\n[00:26.321]When the devil turns around\n[00:30.540]Hold me in your heart tonight\n[00:33.751]In the magic of the dark moonlight\n[00:38.189]Save me from this empty fight\n[00:43.521]In the game of life\n[00:45.409]Like a poem said by a lady in red\n[00:49.088]You hear the last few words of your life\n[00:53.013]With a haunting dance, now you're both in a trance\n[00:56.687]It's time to cast your spell on the night\n[01:01.131]Abracadabra amor-oo-na-na\n[01:04.394]Abracadabra morta-oo-gaga\n[01:08.778]Abracadabra, abra-ooh-na-na\n[01:12.063]In her tongue, she's sayin'\n[01:14.367]Death or love tonight\n[01:18.249]Abracadabra, abracadabra\n[01:22.136]Abracadabra, abracadabra\n[01:25.859]Feel the beat under your feet\n[01:27.554]The floor's on fire\n[01:29.714]Abracadabra, abracadabra\n[01:33.464]Choose the road on the west side\n[01:36.818]As the dust flies, watch it burn\n[01:41.057]Don't waste time on feeling\n[01:44.419]Use your passion no return\n[01:48.724]Hold me in your heart tonight\n[01:51.886]In the magic of the dark moonlight\n[01:56.270]Save me from this empty fight\n[02:01.599]In the game of life\n[02:03.524]Like a poem said by a lady in red\n[02:07.192]You hear the last few words of your life\n[02:11.055]With a haunting dance, now you're both in a trance\n[02:14.786]It's time to cast your spell on the night\n[02:19.225]Abracadabra amor-oo-na-na\n[02:22.553]Abracadabra morta-oo-gaga\n[02:26.852]Abracadabra, abra-ooh-na-na\n[02:30.110]In her tongue, she's sayin'\n[02:32.494]Death or love tonight\n[02:36.244]Abracadabra, abracadabra\n[02:40.161]Abracadabra, abracadabra\n[02:43.966]Feel the beat under your feet\n[02:45.630]The floor's on fire\n[02:47.812]Abracadabra, abracadabra\n[02:50.537]Phantom of the dancefloor, come to 
me\n[02:58.169]Sing for me a sinful melody\n[03:05.833]Ah-ah-ah-ah-ah, ah-ah, ah-ah\n[03:13.453]Ah-ah-ah-ah-ah, ah-ah, ah-ah\n[03:22.025]Abracadabra amor-oo-na-na\n[03:25.423]Abracadabra morta-oo-gaga\n[03:29.674]Abracadabra, abra-ooh-na-na\n[03:33.013]In her tongue, she's sayin'\n[03:35.401]Death or love tonight\n"""],
+                    ["""[00:00.27]只因你太美 baby 只因你太美 baby\n[00:08.95]只因你实在是太美 baby\n[00:13.99]只因你太美 baby\n[00:18.89]迎面走来的你让我如此蠢蠢欲动\n[00:20.88]这种感觉我从未有\n[00:21.79]Cause I got a crush on you who you\n[00:25.74]你是我的我是你的谁\n[00:28.09]再多一眼看一眼就会爆炸\n[00:30.31]再近一点靠近点快被融化\n[00:32.49]想要把你占为己有 baby bae\n[00:34.60]不管走到哪里\n[00:35.44]都会想起的人是你 you you\n[00:38.12]我应该拿你怎样\n[00:39.61]Uh 所有人都在看着你\n[00:42.36]我的心总是不安\n[00:44.18]Oh 我现在已病入膏肓\n[00:46.63]Eh oh\n[00:47.84]难道真的因你而疯狂吗\n[00:51.57]我本来不是这种人\n[00:53.59]因你变成奇怪的人\n[00:55.77]第一次呀变成这样的我\n[01:01.23]不管我怎么去否认\n[01:03.21]只因你太美 baby 只因你太美 baby\n[01:11.46]只因你实在是太美 baby\n[01:16.75]只因你太美 baby\n[01:21.09]Oh eh oh\n[01:22.82]现在确认地告诉我\n[01:25.26]Oh eh oh\n[01:27.31]你到底属于谁\n[01:29.98]Oh eh oh\n[01:31.70]现在确认地告诉我\n[01:34.45]Oh eh oh\n"""],
+                    ["""[00:19.50]也想不到要怎么问你别来无恙\n[00:25.71]世界乱的一塌糊涂可是能怎样\n[00:31.85]偶尔抬起头来还好有颗月亮可赏\n[00:38.96]太多回忆要我怎么摆进行李箱\n[00:45.22]一直没哭一直走路走灰多少太阳\n[00:51.70]因为往事没有办法悬赏 隐形在那大街小巷\n[01:00.22]剪断了它还嚣张\n[01:03.85]我的嘴在说谎 说的那么漂亮\n[01:10.07]说我早就忘了你像月一样的俏脸庞\n[01:16.51]最怕一边忙呀忙一边回想那旧时光\n[01:22.87]剪不掉的是你 带泪的脸 还真是烦\n[01:43.32]多少原因将我绑在半夜屋顶上\n[01:49.23]一直没再爱一个人如今就是这样\n[01:55.75]因为故事跟你说了一半 于是搁在所谓云端\n[02:04.21]谁忘不了谁孤单\n[02:07.79]我的心在说谎 说下去会疯狂\n[02:14.02]如果没有月亮那些日子都无妨\n[02:20.43]最怕一边忙呀忙一边想那旧时光\n[02:26.91]剪不掉的是你 带笑的苦 还真烦\n[02:33.81]我的嘴又说了谎 说的那么漂亮\n[02:39.68]以为已经忘了你的那些美像月光它剪不断\n[02:47.15]因为爱早就钻进心脏 心一跳泪就会烫\n[02:52.22]那些带泪的脸 带笑的苦 还真烦\n[02:59.28]月亮是个凶手 想你的我 是通缉犯\n[03:08.03]我有时候真的很怕望见那月光中的你\n"""]
                 ],
                 inputs=[lrc],
                 label="Lrc Examples",
+                examples_per_page=4,
                 elem_id="lrc-examples-container",
             )
     lyrics_btn.click(
         fn=infer_music,
+        inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, cfg_strength, file_type, odeint_method, preference_infer, Music_Duration],
         outputs=audio_output
     )

diffrhythm/infer/infer.py CHANGED Viewed

@@ -43,6 +43,7 @@ def inference(
     odeint_method,
     pred_frames,
     batch_infer_num,
     chunked=True,
 ):
     with torch.inference_mode():
@@ -50,6 +51,7 @@ def inference(
             cond=cond,
             text=text,
             duration=duration,
             style_prompt=style_prompt,
             negative_style_prompt=negative_style_prompt,
             steps=steps,
@@ -59,7 +61,8 @@ def inference(
             vocal_flag=vocal_flag,
             odeint_method=odeint_method,
             latent_pred_segments=pred_frames,
-            batch_infer_num=batch_infer_num
         )
         outputs = []

     odeint_method,
     pred_frames,
     batch_infer_num,
+    song_duration,
     chunked=True,
 ):
     with torch.inference_mode():
             cond=cond,
             text=text,
             duration=duration,
+            max_duration=duration,
             style_prompt=style_prompt,
             negative_style_prompt=negative_style_prompt,
             steps=steps,
             vocal_flag=vocal_flag,
             odeint_method=odeint_method,
             latent_pred_segments=pred_frames,
+            batch_infer_num=batch_infer_num,
+            song_duration=song_duration
         )
         outputs = []

diffrhythm/infer/infer_utils.py CHANGED Viewed

@@ -194,17 +194,21 @@ def encode_audio(audio, vae_model, chunked=False, overlap=32, chunk_size=128):
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
         return y_final
-def prepare_model(device):
     # prepare cfm model
-    dit_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-1_2", filename="cfm_model.pt")
     dit_config_path = "./diffrhythm/config/config.json"
     with open(dit_config_path) as f:
         model_config = json.load(f)
     dit_model_cls = DiT
     cfm = CFM(
-                transformer=dit_model_cls(**model_config["model"], max_frames=2048),
                 num_channels=model_config["model"]['mel_dim'],
              )
     cfm = cfm.to(device)
     cfm = load_checkpoint(cfm, dit_ckpt_path, device=device, use_ema=False)
@@ -410,12 +414,11 @@ class CNENTokenizer:
         return "|".join([self.id2phone[x - 1] for x in token])
-def get_lrc_token(max_frames, text, tokenizer, device):
     lyrics_shift = 0
     sampling_rate = 44100
     downsample_rate = 2048
-    max_secs = max_frames / (sampling_rate / downsample_rate)
     comma_token_id = 1
     period_token_id = 2
@@ -436,10 +439,15 @@ def get_lrc_token(max_frames, text, tokenizer, device):
     ]
     if max_frames == 2048:
         lrc_with_time = lrc_with_time[:-1] if len(lrc_with_time) >= 1 else lrc_with_time
     normalized_start_time = 0.0
-    lrc = torch.zeros((max_frames,), dtype=torch.long)
     tokens_count = 0
     last_end_pos = 0
@@ -455,7 +463,7 @@ def get_lrc_token(max_frames, text, tokenizer, device):
         frame_shift = random.randint(int(-lyrics_shift), int(lyrics_shift))
         frame_start = max(gt_frame_start - frame_shift, last_end_pos)
-        frame_len = min(num_tokens, max_frames - frame_start)
         lrc[frame_start : frame_start + frame_len] = tokens[:frame_len]
@@ -466,8 +474,11 @@ def get_lrc_token(max_frames, text, tokenizer, device):
     normalized_start_time = torch.tensor(normalized_start_time).unsqueeze(0).to(device)
     normalized_start_time = normalized_start_time.half()
-    return lrc_emb, normalized_start_time
 def load_checkpoint(model, ckpt_path, device, use_ema=True):

             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
         return y_final
+def prepare_model(max_frames, device):
     # prepare cfm model
+    if max_frames == 2048:
+        dit_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-1_2", filename="cfm_model.pt")
+    else:
+        dit_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-1_2-full", filename="cfm_model.pt")
     dit_config_path = "./diffrhythm/config/config.json"
     with open(dit_config_path) as f:
         model_config = json.load(f)
     dit_model_cls = DiT
     cfm = CFM(
+                transformer=dit_model_cls(**model_config["model"], max_frames=max_frames),
                 num_channels=model_config["model"]['mel_dim'],
+                max_frames=max_frames
              )
     cfm = cfm.to(device)
     cfm = load_checkpoint(cfm, dit_ckpt_path, device=device, use_ema=False)
         return "|".join([self.id2phone[x - 1] for x in token])
+def get_lrc_token(max_frames, text, tokenizer, max_secs, device):
     lyrics_shift = 0
     sampling_rate = 44100
     downsample_rate = 2048
     comma_token_id = 1
     period_token_id = 2
     ]
     if max_frames == 2048:
         lrc_with_time = lrc_with_time[:-1] if len(lrc_with_time) >= 1 else lrc_with_time
+    end_frame = max_frames if max_frames == 2048 else int(max_secs * (sampling_rate / downsample_rate))
+    end_frame = min(end_frame, max_frames)
     normalized_start_time = 0.0
+    normalized_duration = end_frame / max_frames
+    lrc = torch.zeros((end_frame,), dtype=torch.long)
     tokens_count = 0
     last_end_pos = 0
         frame_shift = random.randint(int(-lyrics_shift), int(lyrics_shift))
         frame_start = max(gt_frame_start - frame_shift, last_end_pos)
+        frame_len = min(num_tokens, end_frame - frame_start)
         lrc[frame_start : frame_start + frame_len] = tokens[:frame_len]
     normalized_start_time = torch.tensor(normalized_start_time).unsqueeze(0).to(device)
     normalized_start_time = normalized_start_time.half()
+    normalized_duration = torch.tensor(normalized_duration).unsqueeze(0).to(device)
+    normalized_duration = normalized_duration.half()
+    return lrc_emb, normalized_start_time, end_frame, normalized_duration
 def load_checkpoint(model, ckpt_path, device, use_ema=True):

diffrhythm/model/cfm.py CHANGED Viewed

@@ -138,6 +138,7 @@ class CFM(nn.Module):
         latent_pred_segments=None,
         vocal_flag=False,
         odeint_method="euler",
         batch_infer_num=5
     ):
         self.eval()
@@ -208,19 +209,20 @@ class CFM(nn.Module):
         negative_style_prompt = negative_style_prompt.repeat(batch_infer_num, 1)
         start_time = start_time.repeat(batch_infer_num)
         fixed_span_mask = fixed_span_mask.repeat(batch_infer_num, 1, 1)
         def fn(t, x):
             # predict flow
             pred = self.transformer(
                 x=x, cond=step_cond, text=text, time=t, drop_audio_cond=False, drop_text=False, drop_prompt=False,
-                style_prompt=style_prompt, start_time=start_time
             )
             if cfg_strength < 1e-5:
                 return pred
             null_pred = self.transformer(
                 x=x, cond=step_cond, text=text, time=t, drop_audio_cond=True, drop_text=True, drop_prompt=False,
-                style_prompt=negative_style_prompt, start_time=start_time
             )
             return pred + (pred - null_pred) * cfg_strength

         latent_pred_segments=None,
         vocal_flag=False,
         odeint_method="euler",
+        song_duration=None,
         batch_infer_num=5
     ):
         self.eval()
         negative_style_prompt = negative_style_prompt.repeat(batch_infer_num, 1)
         start_time = start_time.repeat(batch_infer_num)
         fixed_span_mask = fixed_span_mask.repeat(batch_infer_num, 1, 1)
+        song_duration = song_duration.repeat(batch_infer_num)
         def fn(t, x):
             # predict flow
             pred = self.transformer(
                 x=x, cond=step_cond, text=text, time=t, drop_audio_cond=False, drop_text=False, drop_prompt=False,
+                style_prompt=style_prompt, start_time=start_time, duration=song_duration
             )
             if cfg_strength < 1e-5:
                 return pred
             null_pred = self.transformer(
                 x=x, cond=step_cond, text=text, time=t, drop_audio_cond=True, drop_text=True, drop_prompt=False,
+                style_prompt=negative_style_prompt, start_time=start_time, duration=song_duration
             )
             return pred + (pred - null_pred) * cfg_strength

diffrhythm/model/dit.py CHANGED Viewed

@@ -118,6 +118,7 @@ class DiT(nn.Module):
         cond_dim = 512
         self.time_embed = TimestepEmbedding(cond_dim)
         self.start_time_embed = TimestepEmbedding(cond_dim)
         if text_dim is None:
             text_dim = mel_dim
         self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers, max_pos=self.max_frames)
@@ -170,6 +171,7 @@ class DiT(nn.Module):
         drop_prompt=False,
         style_prompt=None, # [b d t]
         start_time=None,
     ):
         batch, seq_len = x.shape[0], x.shape[1]
@@ -179,7 +181,8 @@ class DiT(nn.Module):
         # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
         t = self.time_embed(time)
         s_t = self.start_time_embed(start_time)
-        c = t + s_t
         text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
         if drop_prompt:

         cond_dim = 512
         self.time_embed = TimestepEmbedding(cond_dim)
         self.start_time_embed = TimestepEmbedding(cond_dim)
+        self.duration_time_embed = TimestepEmbedding(cond_dim) if self.max_frames == 6144 else None
         if text_dim is None:
             text_dim = mel_dim
         self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers, max_pos=self.max_frames)
         drop_prompt=False,
         style_prompt=None, # [b d t]
         start_time=None,
+        duration=None
     ):
         batch, seq_len = x.shape[0], x.shape[1]
         # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
         t = self.time_embed(time)
         s_t = self.start_time_embed(start_time)
+        d_t = self.duration_time_embed(duration) if self.max_frames == 6144 else torch.zeros_like(s_t)
+        c = t + s_t + d_t
         text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
         if drop_prompt: