WeixuanYuan committed on
Commit
bd6e54b
1 Parent(s): bdd2a77

Upload 70 files

Files changed (48)
  1. app.py +22 -11
  2. webUI/natural_language_guided/__pycache__/README.cpython-310.pyc +0 -0
  3. webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc +0 -0
  4. webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc +0 -0
  5. webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc +0 -0
  6. webUI/natural_language_guided/__pycache__/note2music.cpython-310.pyc +0 -0
  7. webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc +0 -0
  8. webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc +0 -0
  9. webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc +0 -0
  10. webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc +0 -0
  11. webUI/natural_language_guided/build_instrument.py +2 -1
  12. webUI/natural_language_guided/note2music.py +174 -0
  13. webUI/natural_language_guided/text2sound.py +2 -2
  14. webUI/natural_language_guided/track_maker.py +246 -246
  15. webUI/natural_language_guided_4/GAN.py +164 -0
  16. webUI/natural_language_guided_4/README.py +53 -0
  17. webUI/natural_language_guided_4/__pycache__/gradio_webUI.cpython-310.pyc +0 -0
  18. webUI/natural_language_guided_4/__pycache__/inpaint_with_text.cpython-310.pyc +0 -0
  19. webUI/natural_language_guided_4/__pycache__/instruments.cpython-310.pyc +0 -0
  20. webUI/natural_language_guided_4/__pycache__/load_presets.cpython-310.pyc +0 -0
  21. webUI/natural_language_guided_4/__pycache__/note2music.cpython-310.pyc +0 -0
  22. webUI/natural_language_guided_4/__pycache__/sound2sound_with_text.cpython-310.pyc +0 -0
  23. webUI/natural_language_guided_4/__pycache__/text2sound.cpython-310.pyc +0 -0
  24. webUI/natural_language_guided_4/__pycache__/track_maker.cpython-310.pyc +0 -0
  25. webUI/natural_language_guided_4/__pycache__/utils.cpython-310.pyc +0 -0
  26. webUI/natural_language_guided_4/build_instrument.py +305 -0
  27. webUI/natural_language_guided_4/gradio_webUI.py +68 -0
  28. webUI/natural_language_guided_4/inpaint_with_text.py +371 -0
  29. webUI/natural_language_guided_4/instruments.py +60 -0
  30. webUI/natural_language_guided_4/load_presets.py +81 -0
  31. webUI/natural_language_guided_4/note2music.py +200 -0
  32. webUI/natural_language_guided_4/rec.py +190 -0
  33. webUI/natural_language_guided_4/sound2sound_with_text.py +325 -0
  34. webUI/natural_language_guided_4/super_resolution_with_text.py +387 -0
  35. webUI/natural_language_guided_4/text2sound.py +220 -0
  36. webUI/natural_language_guided_4/track_maker.py +248 -0
  37. webUI/natural_language_guided_4/utils.py +228 -0
  38. webUI/presets/instruments/ax.wav +0 -0
  39. webUI/presets/instruments/electronic_sound.wav +0 -0
  40. webUI/presets/instruments/keyboard.wav +0 -0
  41. webUI/presets/instruments/organ.wav +0 -0
  42. webUI/presets/instruments/string.wav +0 -0
  43. webUI/presets/instruments/synth_lead.wav +0 -0
  44. webUI/presets/midis/Air_on_the_G_String.mid +0 -0
  45. webUI/presets/midis/Arhbo.mid +0 -0
  46. webUI/presets/midis/Canon_in_D.mid +0 -0
  47. webUI/presets/midis/Ode_to_Joy_Easy_variation.mid +0 -0
  48. webUI/presets/midis/Rrharil.mid +0 -0
app.py CHANGED
@@ -15,12 +15,17 @@ from model.multimodal_model import get_multi_modal_model
 
 
 import gradio as gr
-from webUI.natural_language_guided.gradio_webUI import GradioWebUI
-from webUI.natural_language_guided.text2sound import get_text2sound_module
-from webUI.natural_language_guided.sound2sound_with_text import get_sound2sound_with_text_module
-from webUI.natural_language_guided.inpaint_with_text import get_inpaint_with_text_module
-from webUI.natural_language_guided.build_instrument import get_build_instrument_module
-from webUI.natural_language_guided.README import get_readme_module
+
+from tools import read_wav_to_numpy
+from webUI.natural_language_guided_4.gradio_webUI import GradioWebUI
+from webUI.natural_language_guided_4.instruments import get_instruments_module
+from webUI.natural_language_guided_4.load_presets import load_presets
+from webUI.natural_language_guided_4.text2sound import get_text2sound_module
+from webUI.natural_language_guided_4.sound2sound_with_text import get_sound2sound_with_text_module
+from webUI.natural_language_guided_4.inpaint_with_text import get_inpaint_with_text_module
+# from webUI.natural_language_guided_4.build_instrument import get_build_instrument_module
+from webUI.natural_language_guided_4.note2music import get_arrangement_module
+# from webUI.natural_language_guided_4.README import get_readme_module
 
 
 
@@ -62,28 +67,34 @@ else:
 
 
 
-
 gradioWebUI = GradioWebUI(device, VAE, uNet, text_encoder, CLAP_tokenizer, freq_resolution=512, time_resolution=256, channels=4, timesteps=1000, squared=False,
                           VAE_scale=4, flexible_duration=True, noise_strategy="repeat", GAN_generator=None)
 
+virtual_instruments, midis = load_presets(gradioWebUI)
+
+
+
 with gr.Blocks(theme=gr.themes.Soft(), mode="dark") as demo:
-    # with gr.Blocks(theme='WeixuanYuan/Soft_dark', mode="dark") as demo:
-    gr.Markdown('Thank you for using DiffuSynth v0.2! \n <span style="color:red">The [Arrangement] feature is still being improved!</span>', unsafe_allow_html=True)
+    gr.Markdown("Thank you for using DiffuSynth v0.2!")
 
     reconstruction_state = gr.State(value={})
     text2sound_state = gr.State(value={})
     sound2sound_state = gr.State(value={})
     inpaint_state = gr.State(value={})
     super_resolution_state = gr.State(value={})
-    virtual_instruments_state = gr.State(value={"virtual_instruments": {}})
+    virtual_instruments_state = gr.State(value={"virtual_instruments": virtual_instruments})
+    midi_files_state = gr.State(value={"midis": midis})
 
     get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state)
    get_sound2sound_with_text_module(gradioWebUI, sound2sound_state, virtual_instruments_state)
    get_inpaint_with_text_module(gradioWebUI, inpaint_state, virtual_instruments_state)
-    get_build_instrument_module(gradioWebUI, virtual_instruments_state)
+    # get_build_instrument_module(gradioWebUI, virtual_instruments_state)
+    get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state)
     # get_readme_module()
+    # get_instruments_module(gradioWebUI, virtual_instruments_state)
 
 demo.launch(debug=True, share=True)
+# demo.launch(debug=True, share=False)
 
 
 
webUI/natural_language_guided/__pycache__/README.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/README.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/README.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/note2music.cpython-310.pyc ADDED
Binary file (6.59 kB).
 
webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc differ
 
webUI/natural_language_guided/build_instrument.py CHANGED
@@ -4,13 +4,14 @@ import torch
 import gradio as gr
 import mido
 from io import BytesIO
+# import pyrubberband as pyrb
+import torchaudio.transforms as transforms
 
 from model.DiffSynthSampler import DiffSynthSampler
 from tools import adsr_envelope, adjust_audio_length
 from webUI.natural_language_guided.track_maker import DiffSynth
 from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
     spectrogram_to_Gradio_image
-import torchaudio.transforms as transforms
 
 
 def time_stretch_audio(waveform, sample_rate, stretch_factor):
webUI/natural_language_guided/note2music.py ADDED
@@ -0,0 +1,174 @@
+import librosa
+import numpy as np
+import torch
+import gradio as gr
+import mido
+from io import BytesIO
+# import pyrubberband as pyrb
+import torchaudio.transforms as transforms
+
+from model.DiffSynthSampler import DiffSynthSampler
+from tools import adsr_envelope, adjust_audio_length
+from webUI.natural_language_guided.track_maker import DiffSynth, Track
+from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
+    spectrogram_to_Gradio_image
+
+
+def get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state):
+    # Load configurations
+    uNet = gradioWebUI.uNet
+    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
+    VAE_scale = gradioWebUI.VAE_scale
+    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
+
+    timesteps = gradioWebUI.timesteps
+    VAE_quantizer = gradioWebUI.VAE_quantizer
+    VAE_decoder = gradioWebUI.VAE_decoder
+    CLAP = gradioWebUI.CLAP
+    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
+    device = gradioWebUI.device
+    squared = gradioWebUI.squared
+    sample_rate = gradioWebUI.sample_rate
+    noise_strategy = gradioWebUI.noise_strategy
+
+    def read_midi(midi, midi_files_dict):
+        print(midi)
+        midi_name = midi_file.name
+        mid = mido.MidiFile(file=BytesIO(midi))
+        tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
+
+        midi_info_text = f"Name: {midi_name}"
+        for track in tracks:
+            midi_info_text += f"\n {len(track.events)}"
+
+
+        return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
+                                              placeholder=midi_info_text),
+                midi_files_state: midi_files_dict}
+
+    def refresh_instruments(virtual_instruments_dict):
+        virtual_instruments_names = list(virtual_instruments_dict["virtual_instruments"].keys())
+        print(f"virtual_instruments_names: {virtual_instruments_names}")
+
+        return {select_instrument_dropdown: gr.Dropdown.update(choices=["New Option 1", "New Option 2", "New Option 3"])}
+
+    def select_sound(virtual_instrument_name, virtual_instruments_dict):
+        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
+        virtual_instrument = virtual_instruments[virtual_instrument_name]
+
+        return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
+                source_sound_phase_image: virtual_instrument["phase_gradio_image"],
+                source_sound_audio: virtual_instrument["signal"]}
+
+    def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names,
+                   virtual_instruments_dict):
+
+        if noising_strength < 1:
+            print(f"Warning: making track with noising_strength = {noising_strength} < 1")
+        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
+        sample_steps = int(inpaint_steps)
+
+        instrument_names = instrument_names.split("@")
+        instruments_configs = {}
+        for virtual_instrument_name in instrument_names:
+            virtual_instrument = virtual_instruments[virtual_instrument_name]
+
+            latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(
+                device)
+            sampler = virtual_instrument["sampler"]
+
+            batchsize = 1
+
+            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
+
+            mid = mido.MidiFile(file=BytesIO(midi))
+            instruments_configs[virtual_instrument_name] = {
+                'sample_steps': sample_steps,
+                'sampler': sampler,
+                'noising_strength': noising_strength,
+                'latent_representation': latent_representation,
+                'attack': attack,
+                'before_release': before_release}
+
+        diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
+
+        full_audio = diffSynth.get_music(mid, instrument_names)
+
+        return {track_audio: (sample_rate, full_audio)}
+
+    with gr.Tab("Arrangement"):
+        gr.Markdown("Make music with generated sounds!")
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=3):
+                preset_button_1 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
+                preset_button_2 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
+                preset_button_3 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
+                midi_file = gr.File(label="Upload midi file", type="binary", scale=2)
+            with gr.Column(scale=3):
+                midi_info_textbox = gr.Textbox(label="Midi info", lines=10, placeholder="Please select/upload a midi on the left.")
+                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
+                                                      placeholder="Names of your instrument used to play the midi", scale=1)
+            with gr.Column(scale=3):
+                refresh_instrument_button = gr.Button(variant="primary", value="Refresh instruments", scale=1)
+                # instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
+                #                                      placeholder="Name of your instrument", scale=1)
+                select_instrument_dropdown = gr.Dropdown(choices=["Option 1", "Option 2", "Option 3"], label="Choose an option")
+                source_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
+            with gr.Column(scale=3):
+                make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
+                track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
+        with gr.Row(variant="panel"):
+            with gr.Tab("Origin sound"):
+                inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
+                                                 label="inpaint_steps")
+                noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
+                end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
+                                                         label="end_noise_level_ratio")
+                attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
+                before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
+                                                  label="before_release in sec")
+                release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
+                mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
+                                                  label="mask_flexivity")
+            with gr.Tab("Length adjustment config"):
+                use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
+                test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
+                test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
+                test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
+                duration_slider = gradioWebUI.get_duration_slider()
+            with gr.Tab("Pitch shift config"):
+                pitch_shift_radio = gr.Radio(choices=["librosa", "torchaudio", "rubberband"],
+                                             value="librosa")
+
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=2):
+                with gr.Row(variant="panel"):
+                    source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
+                                                              height=600, scale=1)
+                    source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
+                                                        height=600, scale=1)
+
+
+
+    # instrument_name_textbox.change(select_sound,
+    #                                inputs=[instrument_name_textbox, virtual_instruments_state],
+    #                                outputs=[source_sound_audio])
+
+    refresh_instrument_button.click(refresh_instruments,
+                                    inputs=[virtual_instruments_state],
+                                    outputs=[select_instrument_dropdown])
+
+    make_track_button.click(make_track,
+                            inputs=[inpaint_steps_slider, midi_file,
+                                    noising_strength_slider,
+                                    attack_slider,
+                                    before_release_slider,
+                                    instrument_names_textbox,
+                                    virtual_instruments_state],
+                            outputs=[track_audio])
+
+    midi_file.change(read_midi,
+                     inputs=[midi_file,
+                             midi_files_state],
+                     outputs=[midi_info_textbox,
+                              midi_files_state])
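A minimal usage sketch for the Arrangement callback added above (not part of the commit): make_track receives the MIDI as raw bytes from gr.File(type="binary") and one instrument name per MIDI track, joined with "@". The preset names "organ" and "string" are assumptions based on the files under webUI/presets/instruments/; the real keys come from load_presets(gradioWebUI).

# Hypothetical illustration only; preset names are assumed to exist in virtual_instruments.
with open("webUI/presets/midis/Canon_in_D.mid", "rb") as f:
    midi_bytes = f.read()

result = make_track(inpaint_steps=20, midi=midi_bytes, noising_strength=1.0,
                    attack=0.5, before_release=0.5,
                    instrument_names="organ@string",  # one name per MIDI track, "@"-separated
                    virtual_instruments_dict={"virtual_instruments": virtual_instruments})
# result[track_audio] is a (sample_rate, waveform) tuple suitable for gr.Audio.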
webUI/natural_language_guided/text2sound.py CHANGED
@@ -46,8 +46,8 @@ def get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_sta
 
         mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
         negative_condition = \
-            CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
-                0]
+            CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
+
         mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))
 
         mySampler.respace(list(np.linspace(0, timesteps - 1, text2sound_sample_steps, dtype=np.int32)))
webUI/natural_language_guided/track_maker.py CHANGED
@@ -1,247 +1,247 @@
 import librosa
 import numpy as np
 import torch
 
 from model.DiffSynthSampler import DiffSynthSampler
 from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT
 import mido
 import torchaudio.transforms as transforms
 from tqdm import tqdm
 
 
 # def pitch_shift_audio(waveform, sample_rate, n_steps, device='cpu', n_fft=1024, hop_length=None):
 #     # If the input is a numpy array, convert it to a torch.Tensor
 #     if isinstance(waveform, np.ndarray):
 #         waveform = torch.from_numpy(waveform)
 #
 #     # Set hop_length to a quarter of n_fft (a reasonable default) to reduce the memory cost of the STFT
 #     if hop_length is None:
 #         hop_length = n_fft // 4
 #
 #     # Move the waveform to the target device
 #     waveform = waveform.to(device, dtype=torch.float32)
 #
 #     # Create the pitch_shift transform and move it to the target device
 #     pitch_shift = transforms.PitchShift(
 #         sample_rate=sample_rate,
 #         n_steps=n_steps,
 #         n_fft=n_fft,
 #         hop_length=hop_length
 #     ).to(device)
 #
 #     # Apply the transform, move the result back to the CPU, and convert it to a numpy array
 #     shifted_waveform = pitch_shift(waveform).detach().cpu().numpy()
 #
 #     return shifted_waveform
 
 
 def pitch_shift_librosa(waveform, sample_rate, total_steps, step_size=4, n_fft=4096, hop_length=None):
     # librosa expects a numpy array as input
     if isinstance(waveform, torch.Tensor):
         waveform = waveform.numpy()
 
     # If hop_length is not given, default to a quarter of n_fft
     if hop_length is None:
         hop_length = n_fft // 4
 
     # Pitch-shift incrementally, at most step_size semitones at a time
     current_waveform = waveform
     num_steps = int(np.ceil(total_steps / step_size))
 
     for i in range(num_steps):
         step = min(step_size, total_steps - i * step_size)  # make sure the last step does not overshoot total_steps
         current_waveform = librosa.effects.pitch_shift(
             current_waveform, sr=sample_rate, n_steps=step,
             n_fft=n_fft, hop_length=hop_length
         )
 
     return current_waveform
 
 
 
 
 class NoteEvent:
     def __init__(self, note, velocity, start_time, duration):
         self.note = note
         self.velocity = velocity
         self.start_time = start_time  # In ticks
         self.duration = duration  # In ticks
 
     def __str__(self):
         return f"Note {self.note}, velocity {self.velocity}, start_time {self.start_time}, duration {self.duration}"
 
 
 class Track:
     def __init__(self, track, ticks_per_beat):
         self.tempo_events = self._parse_tempo_events(track)
         self.events = self._parse_note_events(track)
         self.ticks_per_beat = ticks_per_beat
 
     def _parse_tempo_events(self, track):
         tempo_events = []
         current_tempo = 500000  # Default MIDI tempo is 120 BPM which is 500000 microseconds per beat
         for msg in track:
             if msg.type == 'set_tempo':
                 tempo_events.append((msg.time, msg.tempo))
             elif not msg.is_meta:
                 tempo_events.append((msg.time, current_tempo))
         return tempo_events
 
     def _parse_note_events(self, track):
         events = []
         start_time = 0
         for msg in track:
             if not msg.is_meta:
                 start_time += msg.time
                 if msg.type == 'note_on' and msg.velocity > 0:
                     note_on_time = start_time
                 elif msg.type == 'note_on' and msg.velocity == 0:
                     duration = start_time - note_on_time
                     events.append(NoteEvent(msg.note, msg.velocity, note_on_time, duration))
         return events
 
     def synthesize_track(self, diffSynthSampler, sample_rate=16000):
         track_audio = np.zeros(int(self._get_total_time() * sample_rate), dtype=np.float32)
         current_tempo = 500000  # Start with default MIDI tempo 120 BPM
         duration_note_mapping = {}
 
         for event in tqdm(self.events[:25]):
             current_tempo = self._get_tempo_at(event.start_time)
             seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
             start_time_sec = event.start_time * seconds_per_tick
             # Todo: set a minimum duration
             duration_sec = event.duration * seconds_per_tick
             duration_sec = max(duration_sec, 0.75)
             start_sample = int(start_time_sec * sample_rate)
             if not (str(duration_sec) in duration_note_mapping):
                 note_sample = diffSynthSampler(event.velocity, duration_sec)
                 duration_note_mapping[str(duration_sec)] = note_sample / np.max(np.abs(note_sample))
 
             # note_audio = pyrb.pitch_shift(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
             # note_audio = pitch_shift_audio(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
             note_audio = pitch_shift_librosa(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
             end_sample = start_sample + len(note_audio)
             track_audio[start_sample:end_sample] += note_audio
 
         return track_audio
 
     def _get_tempo_at(self, time_tick):
         current_tempo = 500000  # Start with default MIDI tempo 120 BPM
         elapsed_ticks = 0
 
         for tempo_change in self.tempo_events:
             if elapsed_ticks + tempo_change[0] > time_tick:
                 return current_tempo
             elapsed_ticks += tempo_change[0]
             current_tempo = tempo_change[1]
 
         return current_tempo
 
     def _get_total_time(self):
         total_time = 0
         current_tempo = 500000  # Start with default MIDI tempo 120 BPM
 
         for event in self.events:
             current_tempo = self._get_tempo_at(event.start_time)
             seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
             total_time += event.duration * seconds_per_tick
 
         return total_time
 
 
 class DiffSynth:
     def __init__(self, instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder, text_encoder, CLAP_tokenizer, device,
                  model_sample_rate=16000, timesteps=1000, channels=4, freq_resolution=512, time_resolution=256, VAE_scale=4, squared=False):
 
         self.noise_prediction_model = noise_prediction_model
         self.VAE_quantizer = VAE_quantizer
         self.VAE_decoder = VAE_decoder
         self.device = device
         self.model_sample_rate = model_sample_rate
         self.timesteps = timesteps
         self.channels = channels
         self.freq_resolution = freq_resolution
         self.time_resolution = time_resolution
         self.height = int(freq_resolution/VAE_scale)
         self.VAE_scale = VAE_scale
         self.squared = squared
         self.text_encoder = text_encoder
         self.CLAP_tokenizer = CLAP_tokenizer
 
         # instruments_configs is a dict: string -> (condition, negative_condition, guidance_scale, sample_steps, seed, initial_noise, sampler)
         self.instruments_configs = instruments_configs
         self.diffSynthSamplers = {}
         self._update_instruments()
 
 
     def _update_instruments(self):
 
         def diffSynthSamplerWrapper(instruments_config):
 
             def diffSynthSampler(velocity, duration_sec, sample_rate=16000):
 
                 condition = self.text_encoder.get_text_features(**self.CLAP_tokenizer([""], padding=True, return_tensors="pt")).to(self.device)
                 sample_steps = instruments_config['sample_steps']
                 sampler = instruments_config['sampler']
                 noising_strength = instruments_config['noising_strength']
                 latent_representation = instruments_config['latent_representation']
                 attack = instruments_config['attack']
                 before_release = instruments_config['before_release']
 
                 assert sample_rate == self.model_sample_rate, "sample_rate != model_sample_rate"
 
                 width = int(self.time_resolution * ((duration_sec + 1) / 4) / self.VAE_scale)
 
                 mySampler = DiffSynthSampler(self.timesteps, height=128, channels=4, noise_strategy="repeat", mute=True)
                 mySampler.respace(list(np.linspace(0, self.timesteps - 1, sample_steps, dtype=np.int32)))
 
                 # mask = 1, freeze
                 latent_mask = torch.zeros((1, 1, self.height, width), dtype=torch.float32).to(self.device)
                 latent_mask[:, :, :, :int(self.time_resolution * (attack / 4) / self.VAE_scale)] = 1.0
                 latent_mask[:, :, :, -int(self.time_resolution * ((before_release+1) / 4) / self.VAE_scale):] = 1.0
 
                 latent_representations, _ = \
                     mySampler.inpaint_sample(model=self.noise_prediction_model, shape=(1, self.channels, self.height, width),
                                              noising_strength=noising_strength, condition=condition,
                                              guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                              sampler=sampler,
                                              use_dynamic_mask=True, end_noise_level_ratio=0.0,
                                              mask_flexivity=1.0)
 
 
                 latent_representations = latent_representations[-1]
 
                 quantized_latent_representations, _, (_, _, _) = self.VAE_quantizer(latent_representations)
                 # Todo: remove hard-coding
 
                 flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(self.VAE_decoder,
                                                                                                             quantized_latent_representations,
                                                                                                             resolution=(
                                                                                                                 512,
                                                                                                                 width * self.VAE_scale),
                                                                                                             original_STFT_batch=None,
                                                                                                             )
 
 
                 return rec_signals[0]
 
             return diffSynthSampler
 
         for key in self.instruments_configs.keys():
             self.diffSynthSamplers[key] = diffSynthSamplerWrapper(self.instruments_configs[key])
 
     def get_music(self, mid, instrument_names, sample_rate=16000):
         tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
-        assert len(tracks) == len(instrument_names), f"len(tracks) = {len(tracks)} != {len(instrument_names)} = len(instrument_names)"
+        assert len(tracks) <= len(instrument_names), f"len(tracks) = {len(tracks)} > {len(instrument_names)} = len(instrument_names)"
 
         track_audios = [track.synthesize_track(self.diffSynthSamplers[instrument_names[i]], sample_rate=sample_rate) for i, track in enumerate(tracks)]
 
         # Pad every track to the length of the longest one so they can be mixed together
         max_length = max(len(audio) for audio in track_audios)
         full_audio = np.zeros(max_length, dtype=np.float32)  # initialise the mixed audio buffer with zeros
         for audio in track_audios:
             # a track may be shorter, so pad it with zeros
             padded_audio = np.pad(audio, (0, max_length - len(audio)), 'constant')
             full_audio += padded_audio  # mix this track into the full audio
 
     return full_audio
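The only functional change in this file is the relaxed assertion in DiffSynth.get_music (len(tracks) <= len(instrument_names) instead of strict equality), so extra instrument names beyond the number of MIDI tracks are now tolerated. For reference, a small usage sketch of pitch_shift_librosa defined above (illustrative only, not part of the commit): it shifts in chunks of at most step_size semitones, so a 9-semitone shift is applied as 4 + 4 + 1.

import numpy as np
from webUI.natural_language_guided.track_maker import pitch_shift_librosa

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone_e3 = np.sin(2 * np.pi * 164.81 * t).astype(np.float32)  # E3, the pitch the model is trained on
shifted = pitch_shift_librosa(tone_e3, sr, total_steps=9)    # E3 shifted up 9 semitones, roughly C#4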
webUI/natural_language_guided_4/GAN.py ADDED
@@ -0,0 +1,164 @@
+import gradio as gr
+import numpy as np
+import torch
+
+from tools import safe_int
+from webUI.natural_language_guided_STFT.utils import encodeBatch2GradioOutput, latent_representation_to_Gradio_image, \
+    add_instrument
+
+
+def get_testGAN(gradioWebUI, text2sound_state, virtual_instruments_state):
+    # Load configurations
+    gan_generator = gradioWebUI.GAN_generator
+    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
+    VAE_scale = gradioWebUI.VAE_scale
+    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
+
+    timesteps = gradioWebUI.timesteps
+    VAE_quantizer = gradioWebUI.VAE_quantizer
+    VAE_decoder = gradioWebUI.VAE_decoder
+    CLAP = gradioWebUI.CLAP
+    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
+    device = gradioWebUI.device
+    squared = gradioWebUI.squared
+    sample_rate = gradioWebUI.sample_rate
+    noise_strategy = gradioWebUI.noise_strategy
+
+    def gan_random_sample(text2sound_prompts, text2sound_negative_prompts, text2sound_batchsize,
+                          text2sound_duration,
+                          text2sound_guidance_scale, text2sound_sampler,
+                          text2sound_sample_steps, text2sound_seed,
+                          text2sound_dict):
+        text2sound_seed = safe_int(text2sound_seed, 12345678)
+
+        width = int(time_resolution * ((text2sound_duration + 1) / 4) / VAE_scale)
+
+        text2sound_batchsize = int(text2sound_batchsize)
+
+        text2sound_embedding = \
+            CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
+                device)
+
+        CFG = int(text2sound_guidance_scale)
+
+        condition = text2sound_embedding.repeat(text2sound_batchsize, 1)
+
+        noise = torch.randn(text2sound_batchsize, channels, height, width).to(device)
+        latent_representations = gan_generator(noise, condition)
+
+        print(latent_representations[0, 0, :3, :3])
+
+        latent_representation_gradio_images = []
+        quantized_latent_representation_gradio_images = []
+        new_sound_spectrogram_gradio_images = []
+        new_sound_rec_signals_gradio = []
+
+        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
+        # Todo: remove hard-coding
+        flipped_log_spectrums, rec_signals = encodeBatch2GradioOutput(VAE_decoder, quantized_latent_representations,
+                                                                      resolution=(512, width * VAE_scale),
+                                                                      centralized=False,
+                                                                      squared=squared)
+
+        for i in range(text2sound_batchsize):
+            latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
+            quantized_latent_representation_gradio_images.append(
+                latent_representation_to_Gradio_image(quantized_latent_representations[i]))
+            new_sound_spectrogram_gradio_images.append(flipped_log_spectrums[i])
+            new_sound_rec_signals_gradio.append((sample_rate, rec_signals[i]))
+
+        text2sound_dict["latent_representations"] = latent_representations.to("cpu").detach().numpy()
+        text2sound_dict["quantized_latent_representations"] = quantized_latent_representations.to("cpu").detach().numpy()
+        text2sound_dict["latent_representation_gradio_images"] = latent_representation_gradio_images
+        text2sound_dict["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
+        text2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
+        text2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
+
+        text2sound_dict["condition"] = condition.to("cpu").detach().numpy()
+        # text2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
+        text2sound_dict["guidance_scale"] = CFG
+        text2sound_dict["sampler"] = text2sound_sampler
+
+        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][0],
+                text2sound_quantized_latent_representation_image:
+                    text2sound_dict["quantized_latent_representation_gradio_images"][0],
+                text2sound_sampled_spectrogram_image: text2sound_dict["new_sound_spectrogram_gradio_images"][0],
+                text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][0],
+                text2sound_seed_textbox: text2sound_seed,
+                text2sound_state: text2sound_dict,
+                text2sound_sample_index_slider: gr.update(minimum=0, maximum=text2sound_batchsize - 1, value=0, step=1,
+                                                          visible=True,
+                                                          label="Sample index.",
+                                                          info="Swipe to view other samples")}
+
+    def show_random_sample(sample_index, text2sound_dict):
+        sample_index = int(sample_index)
+        text2sound_dict["sample_index"] = sample_index
+        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][
+            sample_index],
+            text2sound_quantized_latent_representation_image:
+                text2sound_dict["quantized_latent_representation_gradio_images"][sample_index],
+            text2sound_sampled_spectrogram_image: text2sound_dict["new_sound_spectrogram_gradio_images"][
+                sample_index],
+            text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][sample_index]}
+
+
+    with gr.Tab("Text2sound_GAN"):
+        gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=3):
+                text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
+                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
+
+            with gr.Column(scale=1):
+                text2sound_sampling_button = gr.Button(variant="primary",
+                                                       value="Generate a batch of samples and show "
+                                                             "the first one",
+                                                       scale=1)
+                text2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
+                                                           label="Sample index",
+                                                           info="Swipe to view other samples")
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=1, variant="panel"):
+                text2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
+                text2sound_sampler_radio = gradioWebUI.get_sampler_radio()
+                text2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
+                text2sound_duration_slider = gradioWebUI.get_duration_slider()
+                text2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
+                text2sound_seed_textbox = gradioWebUI.get_seed_textbox()
+
+            with gr.Column(scale=1):
+                text2sound_sampled_spectrogram_image = gr.Image(label="Sampled spectrogram", type="numpy", height=420)
+                text2sound_sampled_audio = gr.Audio(type="numpy", label="Play")
+
+
+        with gr.Row(variant="panel"):
+            text2sound_latent_representation_image = gr.Image(label="Sampled latent representation", type="numpy",
+                                                              height=200, width=100)
+            text2sound_quantized_latent_representation_image = gr.Image(label="Quantized latent representation",
+                                                                        type="numpy", height=200, width=100)
+
+        text2sound_sampling_button.click(gan_random_sample,
+                                         inputs=[text2sound_prompts_textbox,
+                                                 text2sound_negative_prompts_textbox,
+                                                 text2sound_batchsize_slider,
+                                                 text2sound_duration_slider,
+                                                 text2sound_guidance_scale_slider, text2sound_sampler_radio,
+                                                 text2sound_sample_steps_slider,
+                                                 text2sound_seed_textbox,
+                                                 text2sound_state],
+                                         outputs=[text2sound_latent_representation_image,
+                                                  text2sound_quantized_latent_representation_image,
+                                                  text2sound_sampled_spectrogram_image,
+                                                  text2sound_sampled_audio,
+                                                  text2sound_seed_textbox,
+                                                  text2sound_state,
+                                                  text2sound_sample_index_slider])
+
+
+        text2sound_sample_index_slider.change(show_random_sample,
+                                              inputs=[text2sound_sample_index_slider, text2sound_state],
+                                              outputs=[text2sound_latent_representation_image,
+                                                       text2sound_quantized_latent_representation_image,
+                                                       text2sound_sampled_spectrogram_image,
+                                                       text2sound_sampled_audio])
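A hedged sketch of the tensor shapes that gan_random_sample above relies on (illustrative, not part of the commit): the generator receives Gaussian noise shaped like the VAE latents plus a repeated CLAP text embedding and is expected to return latents of the same shape, which then pass through VAE_quantizer and VAE_decoder just like the diffusion latents. The 512-dimensional embedding size is an assumption, and app.py currently passes GAN_generator=None.

import torch

batchsize, channels, height = 4, 4, 128                 # height = freq_resolution / VAE_scale = 512 / 4
duration = 3.0
width = int(256 * ((duration + 1) / 4) / 4)              # time_resolution=256, VAE_scale=4 -> 64
condition = torch.randn(1, 512).repeat(batchsize, 1)     # stand-in for the CLAP text features
noise = torch.randn(batchsize, channels, height, width)
# latents = gan_generator(noise, condition)              # expected shape: (batchsize, channels, height, width)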
webUI/natural_language_guided_4/README.py ADDED
@@ -0,0 +1,53 @@
+import gradio as gr
+
+readme_content = """## Stable Diffusion for Sound Generation
+
+This project applies stable diffusion[1] to sound generation. Inspired by the work of AUTOMATIC1111, 2022[2], we have implemented a preliminary version of text2sound, sound2sound, inpaint, as well as an additional interpolation feature, all accessible through a web UI.
+
+### Neural Network Training Data:
+The neural network is trained using the filtered NSynth dataset[3], which is a large-scale and high-quality collection of annotated musical notes, comprising 305,979 musical notes. However, for this project, only samples with a pitch set to E3 were used, resulting in an actual training sample size of 4,096, making it a low-resource project.
+
+The training took place on an NVIDIA Tesla T4 GPU and spanned approximately 10 hours.
+
+### Natural Language Guidance:
+Natural language guidance is derived from the multi-label annotations of the NSynth dataset. The labels included in the training are:
+
+- **Instrument Families**: bass, brass, flute, guitar, keyboard, mallet, organ, reed, string, synth lead, vocal.
+
+- **Instrument Sources**: acoustic, electronic, synthetic.
+
+- **Note Qualities**: bright, dark, distortion, fast decay, long release, multiphonic, nonlinear env, percussive, reverb, tempo-synced.
+
+### Usage Hints:
+
+1. **Prompt Format**: It's recommended to use the format "label1, label2, label3", e.g., "organ, dark, long release".
+
+2. **Unique Sounds**: If you keep generating the same sound, try setting a different seed!
+
+3. **Sample Indexing**: Drag the "Sample index slider" to view other samples within the generated batch.
+
+4. **Running on CPU**: Be cautious with the settings for 'batchsize' and 'sample_steps' when running on CPU to avoid timeouts. Recommended settings are batchsize ≤ 4 and sample_steps = 15.
+
+5. **Editing Sounds**: Generated audio can be downloaded and then re-uploaded for further editing at the sound2sound/inpaint sections.
+
+6. **Guidance Scale**: A higher 'guidance_scale' intensifies the influence of natural language conditioning on the generation[4]. It's recommended to set it between 3 and 10.
+
+7. **Noising Strength**: A smaller 'noising_strength' value makes the generated sound closer to the input sound.
+
+References:
+
+[1] Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (pp. 10684-10695).
+
+[2] AUTOMATIC1111. (2022). Stable Diffusion Web UI [Computer software]. Retrieved from https://github.com/AUTOMATIC1111/stable-diffusion-webui
+
+[3] Engel, J., Resnick, C., Roberts, A., Dieleman, S., Eck, D., Simonyan, K., & Norouzi, M. (2017). Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders.
+
+[4] Ho, J., & Salimans, T. (2022). Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598.
+"""
+
+def get_readme_module():
+
+    with gr.Tab("README"):
+        # gr.Markdown("Use interpolation to generate a gradient sound sequence.")
+        with gr.Column(scale=3):
+            readme_textbox = gr.Textbox(label="readme", lines=40, value=readme_content, interactive=False)
webUI/natural_language_guided_4/__pycache__/gradio_webUI.cpython-310.pyc ADDED
Binary file (3.55 kB).

webUI/natural_language_guided_4/__pycache__/inpaint_with_text.cpython-310.pyc ADDED
Binary file (10.9 kB).

webUI/natural_language_guided_4/__pycache__/instruments.cpython-310.pyc ADDED
Binary file (2.62 kB).

webUI/natural_language_guided_4/__pycache__/load_presets.cpython-310.pyc ADDED
Binary file (2.74 kB).

webUI/natural_language_guided_4/__pycache__/note2music.cpython-310.pyc ADDED
Binary file (7.62 kB).

webUI/natural_language_guided_4/__pycache__/sound2sound_with_text.cpython-310.pyc ADDED
Binary file (9.32 kB).

webUI/natural_language_guided_4/__pycache__/text2sound.cpython-310.pyc ADDED
Binary file (6.5 kB).

webUI/natural_language_guided_4/__pycache__/track_maker.cpython-310.pyc ADDED
Binary file (7.56 kB).

webUI/natural_language_guided_4/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.89 kB).

webUI/natural_language_guided_4/build_instrument.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ import mido
6
+ from io import BytesIO
7
+ # import pyrubberband as pyrb
8
+ import torchaudio.transforms as transforms
9
+
10
+ from model.DiffSynthSampler import DiffSynthSampler
11
+ from tools import adsr_envelope, adjust_audio_length
12
+ from webUI.natural_language_guided.track_maker import DiffSynth
13
+ from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
14
+ spectrogram_to_Gradio_image
15
+
16
+
17
+ def time_stretch_audio(waveform, sample_rate, stretch_factor):
18
+ # 如果输入是 numpy 数组,则转换为 torch.Tensor
19
+ if isinstance(waveform, np.ndarray):
20
+ waveform = torch.from_numpy(waveform)
21
+
22
+ # 确保 waveform 的类型为 torch.float32
23
+ waveform = waveform.to(torch.float32)
24
+
25
+ # 设置 STFT 参数
26
+ n_fft = 2048 # STFT 窗口大小
27
+ hop_length = n_fft // 4 # STFT 的 hop length 设置为 n_fft 的四分之一
28
+
29
+ # 计算短时傅里叶变换 (STFT)
30
+ stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length, return_complex=True)
31
+
32
+ # 创建 TimeStretch 变换
33
+ time_stretch = transforms.TimeStretch(hop_length=hop_length, n_freq=1025, fixed_rate=False)
34
+
35
+ print(stft.shape)
36
+ # 应用时间伸缩
37
+ stretched_stft = time_stretch(stft, stretch_factor)
38
+
39
+ # 将 STFT 转换回时域波形
40
+ stretched_waveform = torch.istft(stretched_stft, n_fft=n_fft, hop_length=hop_length)
41
+
42
+ # 返回处理后的 waveform,转换为 numpy 数组
43
+ return stretched_waveform.detach().numpy()
44
+
45
+
46
+ def get_build_instrument_module(gradioWebUI, virtual_instruments_state):
47
+ # Load configurations
48
+ uNet = gradioWebUI.uNet
49
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
50
+ VAE_scale = gradioWebUI.VAE_scale
51
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
52
+
53
+ timesteps = gradioWebUI.timesteps
54
+ VAE_quantizer = gradioWebUI.VAE_quantizer
55
+ VAE_decoder = gradioWebUI.VAE_decoder
56
+ CLAP = gradioWebUI.CLAP
57
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
58
+ device = gradioWebUI.device
59
+ squared = gradioWebUI.squared
60
+ sample_rate = gradioWebUI.sample_rate
61
+ noise_strategy = gradioWebUI.noise_strategy
62
+
63
+ def select_sound(virtual_instrument_name, virtual_instruments_dict):
64
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
65
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
66
+
67
+ return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
68
+ source_sound_phase_image: virtual_instrument["phase_gradio_image"],
69
+ source_sound_audio: virtual_instrument["signal"]}
70
+
71
+ def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names, virtual_instruments_dict):
72
+
73
+ if noising_strength < 1:
74
+ print(f"Warning: making track with noising_strength = {noising_strength} < 1")
75
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
76
+ sample_steps = int(inpaint_steps)
77
+
78
+ instrument_names = instrument_names.split("@")
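+ # Several instrument names arrive as one '@'-separated string (presumably one name per MIDI track)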
79
+ instruments_configs = {}
80
+ for virtual_instrument_name in instrument_names:
81
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
82
+
83
+ latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
84
+ sampler = virtual_instrument["sampler"]
85
+
86
+ batchsize = 1
87
+
88
+ latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
89
+
90
+ mid = mido.MidiFile(file=BytesIO(midi))
91
+ instruments_configs[virtual_instrument_name] = {
92
+ 'sample_steps': sample_steps,
93
+ 'sampler': sampler,
94
+ 'noising_strength': noising_strength,
95
+ 'latent_representation': latent_representation,
96
+ 'attack': attack,
97
+ 'before_release': before_release}
98
+
99
+ diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
100
+
101
+ full_audio = diffSynth.get_music(mid, instrument_names)
102
+
103
+ return {track_audio: (sample_rate, full_audio)}
104
+
105
+ def test_duration_inpaint(virtual_instrument_name, inpaint_steps, duration, noising_strength, end_noise_level_ratio, attack, before_release, mask_flexivity, virtual_instruments_dict, use_dynamic_mask):
106
+ width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)
107
+
108
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
109
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
110
+
111
+ latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
112
+ sample_steps = int(inpaint_steps)
113
+ sampler = virtual_instrument["sampler"]
114
+ batchsize = 1
115
+
116
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
117
+ mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))
118
+
119
+ latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
120
+
121
+ # mask == 1 marks latent regions that stay frozen during inpainting
122
+ latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)
123
+
124
+ latent_mask[:, :, :, :int(time_resolution * (attack / 4) / VAE_scale)] = 1.0
125
+ latent_mask[:, :, :, -int(time_resolution * ((before_release+1) / 4) / VAE_scale):] = 1.0
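+ # Freeze the attack region and the tail before release; only the middle (sustain) section is re-generated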
126
+
127
+
128
+ text2sound_embedding = \
129
+ CLAP.get_text_features(**CLAP_tokenizer([""], padding=True, return_tensors="pt"))[0].to(
130
+ device)
131
+ condition = text2sound_embedding.repeat(1, 1)
132
+
133
+
134
+ latent_representations, initial_noise = \
135
+ mySampler.inpaint_sample(model=uNet, shape=(batchsize, channels, height, width),
136
+ noising_strength=noising_strength,
137
+ guide_img=latent_representation, mask=latent_mask, return_tensor=True,
138
+ condition=condition, sampler=sampler,
139
+ use_dynamic_mask=use_dynamic_mask,
140
+ end_noise_level_ratio=end_noise_level_ratio,
141
+ mask_flexivity=mask_flexivity)
142
+
143
+ latent_representations = latent_representations[-1]
144
+
145
+ quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
146
+ # Todo: remove hard-coding
147
+ flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
148
+ quantized_latent_representations,
149
+ resolution=(
150
+ 512,
151
+ width * VAE_scale),
152
+ original_STFT_batch=None
153
+ )
154
+
155
+
156
+ return {test_duration_spectrogram_image: flipped_log_spectrums[0],
157
+ test_duration_phase_image: flipped_phases[0],
158
+ test_duration_audio: (sample_rate, rec_signals[0])}
159
+
160
+ def test_duration_envelope(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):
161
+
162
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
163
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
164
+ sample_rate, signal = virtual_instrument["signal"]
165
+
166
+ applied_signal = adsr_envelope(signal=signal, sample_rate=sample_rate, duration=duration,
167
+ attack_time=0.0, decay_time=0.0, sustain_level=1.0, release_time=release)
168
+
169
+ D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
170
+ spc = np.abs(D)
171
+ phase = np.angle(D)
172
+
173
+ flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
174
+ flipped_phase = phase_to_Gradio_image(phase)
175
+
176
+ return {test_duration_spectrogram_image: flipped_log_spectrum,
177
+ test_duration_phase_image: flipped_phase,
178
+ test_duration_audio: (sample_rate, applied_signal)}
179
+
180
+ def test_duration_stretch(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):
181
+
182
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
183
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
184
+ sample_rate, signal = virtual_instrument["signal"]
185
+
186
+ s = 3 / duration
187
+ # applied_signal = pyrb.time_stretch(signal, sample_rate, s)
188
+ applied_signal = time_stretch_audio(signal, sample_rate, s)
189
+ applied_signal = adjust_audio_length(applied_signal, int((duration+1) * sample_rate), sample_rate, sample_rate)
190
+
191
+ D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
192
+ spc = np.abs(D)
193
+ phase = np.angle(D)
194
+
195
+ flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
196
+ flipped_phase = phase_to_Gradio_image(phase)
197
+
198
+ return {test_duration_spectrogram_image: flipped_log_spectrum,
199
+ test_duration_phase_image: flipped_phase,
200
+ test_duration_audio: (sample_rate, applied_signal)}
201
+
202
+
203
+ with gr.Tab("TestInTrack"):
204
+ gr.Markdown("Make music with generated sounds!")
205
+ with gr.Row(variant="panel"):
206
+ with gr.Column(scale=3):
207
+ instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
208
+ placeholder="Name of your instrument", scale=1)
209
+ select_instrument_button = gr.Button(variant="primary", value="Select", scale=1)
210
+ with gr.Column(scale=3):
211
+ inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0, label="inpaint_steps")
212
+ noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
213
+ end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01, label="end_noise_level_ratio")
214
+ attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
215
+ before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="before_release in sec")
216
+ release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
217
+ mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01, label="mask_flexivity")
218
+ with gr.Column(scale=3):
219
+ use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
220
+ test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
221
+ test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
222
+ test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
223
+ duration_slider = gradioWebUI.get_duration_slider()
224
+
225
+ with gr.Row(variant="panel"):
226
+ with gr.Column(scale=2):
227
+ with gr.Row(variant="panel"):
228
+ source_sound_spectrogram_image = gr.Image(label="Source sound spectrogram", type="numpy",
229
+ height=600, scale=1)
230
+ source_sound_phase_image = gr.Image(label="Source sound phase", type="numpy",
231
+ height=600, scale=1)
232
+ source_sound_audio = gr.Audio(type="numpy", label="Play source sound", interactive=False)
233
+
234
+ with gr.Column(scale=3):
235
+ with gr.Row(variant="panel"):
236
+ test_duration_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
237
+ height=600, scale=1)
238
+ test_duration_phase_image = gr.Image(label="New sound phase", type="numpy",
239
+ height=600, scale=1)
240
+ test_duration_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
241
+
242
+ with gr.Row(variant="panel"):
243
+ with gr.Column(scale=1):
244
+ # track_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
245
+ # height=420, scale=1)
246
+ midi_file = gr.File(label="Upload midi file", type="binary")
247
+ instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
248
+ placeholder="Names of your instrument used to play the midi", scale=1)
249
+ track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
250
+ make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
251
+
252
+ select_instrument_button.click(select_sound,
253
+ inputs=[instrument_name_textbox, virtual_instruments_state],
254
+ outputs=[source_sound_spectrogram_image,
255
+ source_sound_phase_image,
256
+ source_sound_audio])
257
+
258
+ test_duration_envelope_button.click(test_duration_envelope,
259
+ inputs=[instrument_name_textbox, duration_slider,
260
+ noising_strength_slider,
261
+ attack_slider,
262
+ before_release_slider,
263
+ release_slider,
264
+ virtual_instruments_state,
265
+ ],
266
+ outputs=[test_duration_spectrogram_image,
267
+ test_duration_phase_image,
268
+ test_duration_audio])
269
+
270
+ test_duration_stretch_button.click(test_duration_stretch,
271
+ inputs=[instrument_name_textbox, duration_slider,
272
+ noising_strength_slider,
273
+ attack_slider,
274
+ before_release_slider,
275
+ release_slider,
276
+ virtual_instruments_state,
277
+ ],
278
+ outputs=[test_duration_spectrogram_image,
279
+ test_duration_phase_image,
280
+ test_duration_audio])
281
+
282
+ test_duration_inpaint_button.click(test_duration_inpaint,
283
+ inputs=[instrument_name_textbox,
284
+ inpaint_steps_slider,
285
+ duration_slider,
286
+ noising_strength_slider,
287
+ end_noise_level_ratio_slider,
288
+ attack_slider,
289
+ before_release_slider,
290
+ mask_flexivity_slider,
291
+ virtual_instruments_state,
292
+ use_dynamic_mask_checkbox],
293
+ outputs=[test_duration_spectrogram_image,
294
+ test_duration_phase_image,
295
+ test_duration_audio])
296
+
297
+ make_track_button.click(make_track,
298
+ inputs=[inpaint_steps_slider, midi_file,
299
+ noising_strength_slider,
300
+ attack_slider,
301
+ before_release_slider,
302
+ instrument_names_textbox,
303
+ virtual_instruments_state],
304
+ outputs=[track_audio])
305
+
webUI/natural_language_guided_4/gradio_webUI.py ADDED
@@ -0,0 +1,68 @@
1
+ import gradio as gr
2
+
3
+
4
+ class GradioWebUI():
5
+
6
+ def __init__(self, device, VAE, uNet, CLAP, CLAP_tokenizer,
7
+ freq_resolution=512, time_resolution=256, channels=4, timesteps=1000,
8
+ sample_rate=16000, squared=False, VAE_scale=4,
9
+ flexible_duration=False, noise_strategy="repeat",
10
+ GAN_generator = None):
11
+ self.device = device
12
+ self.VAE_encoder, self.VAE_quantizer, self.VAE_decoder = VAE._encoder, VAE._vq_vae, VAE._decoder
13
+ self.uNet = uNet
14
+ self.CLAP, self.CLAP_tokenizer = CLAP, CLAP_tokenizer
15
+ self.freq_resolution, self.time_resolution = freq_resolution, time_resolution
16
+ self.channels = channels
17
+ self.GAN_generator = GAN_generator
18
+
19
+ self.timesteps = timesteps
20
+ self.sample_rate = sample_rate
21
+ self.squared = squared
22
+ self.VAE_scale = VAE_scale
23
+ self.flexible_duration = flexible_duration
24
+ self.noise_strategy = noise_strategy
25
+
26
+ self.text2sound_state = gr.State(value={})
27
+ self.interpolation_state = gr.State(value={})
28
+ self.sound2sound_state = gr.State(value={})
29
+ self.inpaint_state = gr.State(value={})
30
+
31
+ def get_sample_steps_slider(self):
32
+ default_steps = 10 if (self.device == "cpu") else 20
33
+ return gr.Slider(minimum=10, maximum=100, value=default_steps, step=1,
34
+ label="Sample steps",
35
+ info="Sampling steps. The more sampling steps, the better the "
36
+ "theoretical result, but the time it consumes.")
37
+
38
+ def get_sampler_radio(self):
39
+ # return gr.Radio(choices=["ddpm", "ddim", "dpmsolver++", "dpmsolver"], value="ddim", label="Sampler")
40
+ return gr.Radio(choices=["ddpm", "ddim"], value="ddim", label="Sampler")
41
+
42
+ def get_batchsize_slider(self, cpu_batchsize=1):
43
+ return gr.Slider(minimum=1., maximum=16, value=cpu_batchsize if (self.device == "cpu") else 8, step=1, label="Batchsize")
44
+
45
+ def get_time_resolution_slider(self):
46
+ return gr.Slider(minimum=16., maximum=int(1024/self.VAE_scale), value=int(256/self.VAE_scale), step=1, label="Time resolution", interactive=True)
47
+
48
+ def get_duration_slider(self):
49
+ if self.flexible_duration:
50
+ return gr.Slider(minimum=0.25, maximum=8., value=3., step=0.01, label="duration in sec")
51
+ else:
52
+ return gr.Slider(minimum=1., maximum=8., value=3., step=1., label="duration in sec")
53
+
54
+ def get_guidance_scale_slider(self):
55
+ return gr.Slider(minimum=0., maximum=20., value=6., step=1.,
56
+ label="Guidance scale",
57
+ info="The larger this value, the more the generated sound is "
58
+ "influenced by the condition. Setting it to 0 is equivalent to "
59
+ "the negative case.")
60
+
61
+ def get_noising_strength_slider(self, default_noising_strength=0.7):
62
+ return gr.Slider(minimum=0.0, maximum=1.00, value=default_noising_strength, step=0.01,
63
+ label="noising strength",
64
+ info="The smaller this value, the more the generated sound is "
65
+ "closed to the origin.")
66
+
67
+ def get_seed_textbox(self):
68
+ return gr.Textbox(label="Seed", lines=1, placeholder="seed", value=0)
webUI/natural_language_guided_4/inpaint_with_text.py ADDED
@@ -0,0 +1,371 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ from scipy.ndimage import zoom
6
+
7
+ from model.DiffSynthSampler import DiffSynthSampler
8
+ from tools import adjust_audio_length, safe_int, pad_STFT, encode_stft
9
+ from webUI.natural_language_guided_4.utils import latent_representation_to_Gradio_image, InputBatch2Encode_STFT, \
10
+ encodeBatch2GradioOutput_STFT, add_instrument, average_np_arrays
11
+
12
+
13
+ def get_triangle_mask(height, width):
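+ # Returns a (height, width) binary mask that is 1 in the triangular region where i < slope * j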
14
+ mask = np.zeros((height, width))
15
+ slope = 8 / 3
16
+ for i in range(height):
17
+ for j in range(width):
18
+ if i < slope * j:
19
+ mask[i, j] = 1
20
+ return mask
21
+
22
+
23
+ def get_inpaint_with_text_module(gradioWebUI, inpaintWithText_state, virtual_instruments_state):
24
+ # Load configurations
25
+ uNet = gradioWebUI.uNet
26
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
27
+ VAE_scale = gradioWebUI.VAE_scale
28
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
29
+ timesteps = gradioWebUI.timesteps
30
+ VAE_encoder = gradioWebUI.VAE_encoder
31
+ VAE_quantizer = gradioWebUI.VAE_quantizer
32
+ VAE_decoder = gradioWebUI.VAE_decoder
33
+ CLAP = gradioWebUI.CLAP
34
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
35
+ device = gradioWebUI.device
36
+ squared = gradioWebUI.squared
37
+ sample_rate = gradioWebUI.sample_rate
38
+ noise_strategy = gradioWebUI.noise_strategy
39
+
40
+ def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin, inpaintWithText_dict):
41
+
42
+ origin_sr, origin_audio = sound2sound_origin
43
+
44
+ origin_audio = origin_audio / np.max(np.abs(origin_audio))
45
+
46
+ width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
47
+ audio_length = 256 * (VAE_scale * width - 1)
48
+ origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
49
+
50
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
51
+ padded_D = pad_STFT(D)
52
+ encoded_D = encode_stft(padded_D)
53
+
54
+ # Todo: justify batchsize to 1
55
+ origin_spectrogram_batch_tensor = torch.from_numpy(
56
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
57
+
58
+ # Todo: remove hard-coding
59
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
60
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
61
+ squared=squared)
62
+
63
+ inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
64
+ inpaintWithText_dict[
65
+ "sound2sound_origin_upload_latent_representation_image"] = latent_representation_to_Gradio_image(
66
+ origin_latent_representations[0]).tolist()
67
+ inpaintWithText_dict[
68
+ "sound2sound_origin_upload_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
69
+ quantized_origin_latent_representations[0]).tolist()
70
+ return {sound2sound_origin_spectrogram_image: origin_flipped_log_spectrums[0],
71
+ sound2sound_origin_phase_image: origin_flipped_phases[0],
72
+ sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
73
+ origin_latent_representations[0]),
74
+ sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
75
+ quantized_origin_latent_representations[0]),
76
+ sound2sound_origin_microphone_latent_representation_image: gr.update(),
77
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
78
+ inpaintWithText_state: inpaintWithText_dict}
79
+
80
+ def sound2sound_sample(sound2sound_origin_spectrogram,
81
+ text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
82
+ sound2sound_guidance_scale, sound2sound_sampler,
83
+ sound2sound_sample_steps,
84
+ sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area,
85
+ mask_time_begin, mask_time_end, mask_frequency_begin, mask_frequency_end,
86
+ inpaintWithText_dict
87
+ ):
88
+
89
+ # input preprocessing
90
+ sound2sound_seed = safe_int(sound2sound_seed, 12345678)
91
+ sound2sound_batchsize = int(sound2sound_batchsize)
92
+ noising_strength = sound2sound_noising_strength
93
+ sound2sound_sample_steps = int(sound2sound_sample_steps)
94
+ CFG = int(sound2sound_guidance_scale)
95
+
96
+ text2sound_embedding = \
97
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
98
+ device)
99
+
100
+ averaged_transparency = average_np_arrays(sound2sound_origin_spectrogram["layers"])
101
+ # print(f"averaged_transparency: {averaged_transparency}")
102
+ averaged_transparency = averaged_transparency[:, :, -1]
103
+ # print(f"averaged_transparency: {averaged_transparency}")
104
+ # print(f"np.shape(averaged_transparency): {np.shape(averaged_transparency)}")
105
+ # print(f"np.mean(averaged_transparency): {np.mean(averaged_transparency)}")
106
+ origin_latent_representations = torch.tensor(
107
+ inpaintWithText_dict["origin_upload_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
108
+ device)
109
+
110
+ merged_mask = np.where(averaged_transparency > 0, 1, 0)
111
+ latent_mask = zoom(merged_mask, (1 / VAE_scale, 1 / VAE_scale))
112
+ latent_mask = np.clip(latent_mask, 0, 1)
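+ # Any pixel painted on the spectrogram editor becomes 1 in the mask, which is then downscaled by VAE_scale to the latent resolution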
113
+ # print(f"latent_mask.avg = {np.mean(latent_mask)}")
114
+ latent_mask[int(mask_frequency_begin):int(mask_frequency_end),
115
+ int(mask_time_begin * time_resolution / (VAE_scale * 4)):int(
116
+ mask_time_end * time_resolution / (VAE_scale * 4))] = 1
117
+
118
+
119
+ if sound2sound_inpaint_area == "masked":
120
+ latent_mask = 1 - latent_mask
121
+ latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(sound2sound_batchsize, channels, 1,
122
+ 1).float().to(device)
123
+ latent_mask = torch.flip(latent_mask, [2])
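+ # The mask was drawn on the vertically flipped spectrogram display, so flip it along the frequency axis to match the latent orientation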
124
+
125
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
126
+ unconditional_condition = \
127
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
128
+ 0]
129
+ mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))
130
+
131
+ normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
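+ # Respace over the full schedule so that roughly `sample_steps` denoising steps fall inside the noised portion of the trajectory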
132
+
133
+ mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
134
+
135
+ # Todo: remove hard-coding
136
+ width = origin_latent_representations.shape[-1]
137
+ condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
138
+
139
+ new_sound_latent_representations, initial_noise = \
140
+ mySampler.inpaint_sample(model=uNet, shape=(sound2sound_batchsize, channels, height, width),
141
+ seed=sound2sound_seed,
142
+ noising_strength=noising_strength,
143
+ guide_img=origin_latent_representations, mask=latent_mask, return_tensor=True,
144
+ condition=condition, sampler=sound2sound_sampler)
145
+
146
+ new_sound_latent_representations = new_sound_latent_representations[-1]
147
+
148
+ # Quantize new sound latent representations
149
+ quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
150
+ new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
151
+ VAE_decoder,
152
+ quantized_new_sound_latent_representations,
153
+ resolution=(
154
+ 512,
155
+ width * VAE_scale),
156
+ original_STFT_batch=None
157
+ )
158
+
159
+ new_sound_latent_representation_gradio_images = []
160
+ new_sound_quantized_latent_representation_gradio_images = []
161
+ new_sound_spectrogram_gradio_images = []
162
+ new_sound_phase_gradio_images = []
163
+ new_sound_rec_signals_gradio = []
164
+ for i in range(sound2sound_batchsize):
165
+ new_sound_latent_representation_gradio_images.append(
166
+ latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
167
+ new_sound_quantized_latent_representation_gradio_images.append(
168
+ latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
169
+ new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
170
+ new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
171
+ new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
172
+
173
+ inpaintWithText_dict[
174
+ "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
175
+ inpaintWithText_dict[
176
+ "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
177
+ inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
178
+ inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
179
+ inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
180
+
181
+ inpaintWithText_dict["latent_representations"] = new_sound_latent_representations.to("cpu").detach().numpy()
182
+ inpaintWithText_dict["quantized_latent_representations"] = quantized_new_sound_latent_representations.to(
183
+ "cpu").detach().numpy()
184
+ inpaintWithText_dict["sampler"] = sound2sound_sampler
185
+
186
+ return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
187
+ new_sound_latent_representations[0]),
188
+ sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
189
+ quantized_new_sound_latent_representations[0]),
190
+ sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
191
+ sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
192
+ sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
193
+ sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
194
+ step=1.0,
195
+ visible=True,
196
+ label="Sample index",
197
+ info="Swipe to view other samples"),
198
+ sound2sound_seed_textbox: sound2sound_seed,
199
+ inpaintWithText_state: inpaintWithText_dict}
200
+
201
+ def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
202
+ sample_index = int(sound2sound_sample_index)
203
+ return {sound2sound_new_sound_latent_representation_image:
204
+ inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
205
+ sound2sound_new_sound_quantized_latent_representation_image:
206
+ inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
207
+ sound2sound_new_sound_spectrogram_image: inpaintWithText_dict["new_sound_spectrogram_gradio_images"][
208
+ sample_index],
209
+ sound2sound_new_sound_phase_image: inpaintWithText_dict["new_sound_phase_gradio_images"][
210
+ sample_index],
211
+ sound2sound_new_sound_audio: inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index]}
212
+
213
+ def save_virtual_instrument(sample_index, virtual_instrument_name, sound2sound_dict, virtual_instruments_dict):
214
+
215
+ virtual_instruments_dict = add_instrument(sound2sound_dict, virtual_instruments_dict, virtual_instrument_name,
216
+ sample_index)
217
+ return {virtual_instruments_state: virtual_instruments_dict,
218
+ sound2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
219
+ placeholder=f"Saved as {virtual_instrument_name}!")}
220
+
221
+ with gr.Tab("Inpaint"):
222
+ gr.Markdown("Upload a musical note and select the area by drawing on \"Input spectrogram\" for inpainting!")
223
+ with gr.Row(variant="panel"):
224
+ with gr.Column(scale=3):
225
+ text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
226
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
227
+
228
+ with gr.Column(scale=1):
229
+ sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
230
+
231
+ sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
232
+ label="Sample index",
233
+ info="Swipe to view other samples")
234
+
235
+ with gr.Row(variant="panel"):
236
+ with gr.Column(scale=1):
237
+ sound2sound_duration_slider = gradioWebUI.get_duration_slider()
238
+ sound2sound_origin_audio = gr.Audio(
239
+ sources=["microphone", "upload"], label="Upload/Record source sound",
240
+ waveform_options=gr.WaveformOptions(
241
+ waveform_color="#01C6FF",
242
+ waveform_progress_color="#0066B4",
243
+ skip_length=1,
244
+ show_controls=False,
245
+ ),
246
+ )
247
+
248
+ with gr.Row(variant="panel"):
249
+ with gr.Tab("Sound2sound settings"):
250
+ sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
251
+ sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
252
+ sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
253
+ sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider()
254
+ sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
255
+ sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()
256
+
257
+ with gr.Tab("Mask prototypes"):
258
+ with gr.Tab("Mask along time axis"):
259
+ mask_time_begin_slider = gr.Slider(minimum=0.0, maximum=4.00, value=0.0, step=0.01,
260
+ label="Begin time")
261
+ mask_time_end_slider = gr.Slider(minimum=0.0, maximum=4.00, value=0.0, step=0.01,
262
+ label="End time")
263
+ with gr.Tab("Mask along frequency axis"):
264
+ mask_frequency_begin_slider = gr.Slider(minimum=0, maximum=127, value=0, step=1,
265
+ label="Begin freq pixel")
266
+ mask_frequency_end_slider = gr.Slider(minimum=0, maximum=127, value=0, step=1,
267
+ label="End freq pixel")
268
+
269
+ with gr.Column(scale=1):
270
+ with gr.Row(variant="panel"):
271
+ sound2sound_origin_spectrogram_image = gr.ImageEditor(label="Input spectrogram (draw here!)",
272
+ type="numpy",
273
+ visible=True, height=600, scale=1)
274
+
275
+ sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
276
+ height=600, scale=1)
277
+
278
+ with gr.Row(variant="panel"):
279
+ sound2sound_inpaint_area_radio = gr.Radio(label="Inpainting area", choices=["masked", "unmasked"],
280
+ value="masked", scale=1)
281
+
282
+ sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False,
283
+ waveform_options=gr.WaveformOptions(
284
+ waveform_color="#FFB6C1",
285
+ waveform_progress_color="#FF0000",
286
+ skip_length=1,
287
+ show_controls=False,
288
+ ), scale=1 )
289
+
290
+ with gr.Row(variant="panel"):
291
+ sound2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
292
+ placeholder="Name of your instrument")
293
+ sound2sound_save_instrument_button = gr.Button(variant="primary",
294
+ value="Save instrument",
295
+ scale=1)
296
+
297
+ with gr.Row(variant="panel"):
298
+ sound2sound_origin_upload_latent_representation_image = gr.Image(label="Original latent representation",
299
+ type="numpy", height=800,
300
+ visible=False)
301
+ sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
302
+ label="Original quantized latent representation", type="numpy", height=800, visible=False)
303
+
304
+ sound2sound_origin_microphone_latent_representation_image = gr.Image(label="Original latent representation",
305
+ type="numpy", height=800,
306
+ visible=False)
307
+ sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
308
+ label="Original quantized latent representation", type="numpy", height=800, visible=False)
309
+
310
+ sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
311
+ type="numpy", height=800, visible=False)
312
+ sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
313
+ label="New sound quantized latent representation", type="numpy", height=800, visible=False)
314
+
315
+ sound2sound_origin_phase_image = gr.Image(label="Original upload phase",
316
+ type="numpy", visible=False)
317
+
318
+ sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
319
+ height=600, scale=1, visible=False)
320
+
321
+ sound2sound_origin_audio.change(receive_upload_origin_audio,
322
+ inputs=[sound2sound_duration_slider, sound2sound_origin_audio,
323
+ inpaintWithText_state],
324
+ outputs=[sound2sound_origin_spectrogram_image,
325
+ sound2sound_origin_phase_image,
326
+ sound2sound_origin_upload_latent_representation_image,
327
+ sound2sound_origin_upload_quantized_latent_representation_image,
328
+ sound2sound_origin_microphone_latent_representation_image,
329
+ sound2sound_origin_microphone_quantized_latent_representation_image,
330
+ inpaintWithText_state])
331
+
332
+ sound2sound_sample_button.click(sound2sound_sample,
333
+ inputs=[sound2sound_origin_spectrogram_image,
334
+ text2sound_prompts_textbox,
335
+ text2sound_negative_prompts_textbox,
336
+ sound2sound_batchsize_slider,
337
+ sound2sound_guidance_scale_slider,
338
+ sound2sound_sampler_radio,
339
+ sound2sound_sample_steps_slider,
340
+ sound2sound_noising_strength_slider,
341
+ sound2sound_seed_textbox,
342
+ sound2sound_inpaint_area_radio,
343
+ mask_time_begin_slider,
344
+ mask_time_end_slider,
345
+ mask_frequency_begin_slider,
346
+ mask_frequency_end_slider,
347
+ inpaintWithText_state],
348
+ outputs=[sound2sound_new_sound_latent_representation_image,
349
+ sound2sound_new_sound_quantized_latent_representation_image,
350
+ sound2sound_new_sound_spectrogram_image,
351
+ sound2sound_new_sound_phase_image,
352
+ sound2sound_new_sound_audio,
353
+ sound2sound_sample_index_slider,
354
+ sound2sound_seed_textbox,
355
+ inpaintWithText_state])
356
+
357
+ sound2sound_sample_index_slider.change(show_sound2sound_sample,
358
+ inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
359
+ outputs=[sound2sound_new_sound_latent_representation_image,
360
+ sound2sound_new_sound_quantized_latent_representation_image,
361
+ sound2sound_new_sound_spectrogram_image,
362
+ sound2sound_new_sound_phase_image,
363
+ sound2sound_new_sound_audio])
364
+
365
+ sound2sound_save_instrument_button.click(save_virtual_instrument,
366
+ inputs=[sound2sound_sample_index_slider,
367
+ sound2sound_instrument_name_textbox,
368
+ inpaintWithText_state,
369
+ virtual_instruments_state],
370
+ outputs=[virtual_instruments_state,
371
+ sound2sound_instrument_name_textbox])
webUI/natural_language_guided_4/instruments.py ADDED
@@ -0,0 +1,60 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+
4
+ from model.DiffSynthSampler import DiffSynthSampler
5
+ from tools import safe_int, read_wav_to_numpy
6
+ from webUI.natural_language_guided.utils import latent_representation_to_Gradio_image, \
7
+ encodeBatch2GradioOutput_STFT, add_instrument
8
+ from webUI.natural_language_guided_4.utils import resize_image_to_aspect_ratio
9
+
10
+
11
+ def get_instruments_module(gradioWebUI, virtual_instruments_state):
12
+
13
+ with gr.Tab("intruments"):
14
+ gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
15
+ with gr.Row(variant="panel"):
16
+ with gr.Column(scale=1):
17
+ input_text = gr.Textbox(label="input")
18
+
19
+ @gr.render(inputs=input_text)
20
+ def show_split(text):
21
+ textboxes = []
22
+
23
+ if len(text) == 0:
24
+ gr.Markdown("## No Input Provided")
25
+ else:
26
+ for letter in text:
27
+ textboxes.append(gr.Textbox(letter, interactive=True))
28
+
29
+ def merge(*splitted_texts):
30
+ out = ""
31
+ for t in splitted_texts:
32
+ out += t
33
+ return out
34
+
35
+ submit_button.click(merge, inputs=textboxes, outputs=merged_textbox)
36
+
37
+ submit_button = gr.Button("submit")
38
+
39
+ merged_textbox = gr.Textbox(placeholder="placeholder", interactive=False)
40
+
41
+ with gr.Column(scale=1):
42
+
43
+ @gr.render(inputs=virtual_instruments_state)
44
+ def check_instruments(virtual_instruments_dict):
45
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
46
+ instrument_names = list(virtual_instruments.keys())
47
+
48
+ instrument_dropdown = gr.Dropdown(
49
+ instrument_names, label="instrument", info="info placeholder"
50
+ )
51
+
52
+ def select_instrument(instrument):
53
+ print(f"instrument: {instrument}")
54
+ sr, signal = virtual_instruments[instrument]["signal"]
55
+ return {selected_instrument_audio: (sr, signal)}
56
+
57
+ instrument_dropdown.select(select_instrument, inputs=instrument_dropdown,
58
+ outputs=selected_instrument_audio)
59
+
60
+ selected_instrument_audio = gr.Audio(type="numpy", label="Play", scale=1, interactive=False)
webUI/natural_language_guided_4/load_presets.py ADDED
@@ -0,0 +1,81 @@
1
+ import os
2
+
3
+ import librosa
4
+ import mido
5
+ import numpy as np
6
+ import torch
7
+
8
+ from tools import read_wav_to_numpy, pad_STFT, encode_stft
9
+ from webUI.natural_language_guided_4.gradio_webUI import GradioWebUI
10
+ from webUI.natural_language_guided_4.utils import InputBatch2Encode_STFT
11
+
12
+
13
+ def load_presets(gradioWebUI: GradioWebUI):
14
+ # Load configurations
15
+ uNet = gradioWebUI.uNet
16
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
17
+ VAE_scale = gradioWebUI.VAE_scale
18
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
19
+
20
+ timesteps = gradioWebUI.timesteps
21
+ VAE_quantizer = gradioWebUI.VAE_quantizer
22
+ VAE_encoder = gradioWebUI.VAE_encoder
23
+ VAE_decoder = gradioWebUI.VAE_decoder
24
+ CLAP = gradioWebUI.CLAP
25
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
26
+ device = gradioWebUI.device
27
+ squared = gradioWebUI.squared
28
+ sample_rate = gradioWebUI.sample_rate
29
+ noise_strategy = gradioWebUI.noise_strategy
30
+
31
+ def add_preset_instruments(virtual_instruments, instrument_name):
32
+
33
+ instruments_path = os.path.join("webUI", "presets", "instruments", f"{instrument_name}.wav")
34
+ sample_rate, origin_audio = read_wav_to_numpy(instruments_path)
35
+
36
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
37
+ padded_D = pad_STFT(D)
38
+ encoded_D = encode_stft(padded_D)
39
+
40
+ # Todo: justify batchsize to 1
41
+ origin_spectrogram_batch_tensor = torch.from_numpy(
42
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
43
+
44
+ # Todo: remove hard-coding
45
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
46
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
47
+ squared=squared)
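+ # Each preset wav is encoded once at start-up; the cached latent representation is what the samplers reuse later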
48
+
49
+
50
+ virtual_instrument = {"latent_representation": origin_latent_representations[0].to("cpu").detach().numpy(),
51
+ "quantized_latent_representation": quantized_origin_latent_representations[0].to(
52
+ "cpu").detach().numpy(),
53
+ "sampler": "ddim",
54
+ "signal": (sample_rate, origin_audio),
55
+ "spectrogram_gradio_image": origin_flipped_log_spectrums[0],
56
+ "phase_gradio_image": origin_flipped_phases[0]}
57
+ virtual_instruments[f"preset_{instrument_name}"] = virtual_instrument
58
+ return virtual_instruments
59
+
60
+ virtual_instruments = {}
61
+ preset_instrument_names = ["ax", "electronic_sound", "organ", "synth_lead", "keyboard", "string"]
62
+ for preset_instrument_name in preset_instrument_names:
63
+ virtual_instruments = add_preset_instruments(virtual_instruments, preset_instrument_name)
64
+
65
+
66
+
67
+ def load_midi_files():
68
+
69
+ midis_dict = {}
70
+ midi_file_names = ["Ode_to_Joy_Easy_variation", "Air_on_the_G_String", "Canon_in_D"]
71
+
72
+ for midi_file_name in midi_file_names:
73
+ midi_path = os.path.join("webUI", "presets", "midis", f"{midi_file_name}.mid")
74
+ mid = mido.MidiFile(midi_path)
75
+ midis_dict[midi_file_name] = mid
76
+
77
+ return midis_dict
78
+
79
+ midis = load_midi_files()
80
+
81
+ return virtual_instruments, midis
webUI/natural_language_guided_4/note2music.py ADDED
+ import torch
2
+ import gradio as gr
3
+ import mido
4
+ from io import BytesIO
5
+ # import pyrubberband as pyrb
6
+
7
+ from webUI.natural_language_guided_4.track_maker import DiffSynth, Track
8
+
9
+
10
+ def get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state):
11
+ # Load configurations
12
+ uNet = gradioWebUI.uNet
13
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
14
+ VAE_scale = gradioWebUI.VAE_scale
15
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
16
+
17
+ timesteps = gradioWebUI.timesteps
18
+ VAE_quantizer = gradioWebUI.VAE_quantizer
19
+ VAE_decoder = gradioWebUI.VAE_decoder
20
+ CLAP = gradioWebUI.CLAP
21
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
22
+ device = gradioWebUI.device
23
+ squared = gradioWebUI.squared
24
+ sample_rate = gradioWebUI.sample_rate
25
+ noise_strategy = gradioWebUI.noise_strategy
26
+
27
+ def read_midi(midi, midi_dict):
28
+ mid = mido.MidiFile(file=BytesIO(midi))
29
+ tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
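+ # Wrap each raw MIDI track in Track so its note events can be counted for the summary below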
30
+
31
+ midi_info_text = f"Uploaded midi:"
32
+ for i, track in enumerate(tracks):
33
+ midi_info_text += f"\n{len(track.events)} events loaded from Track {i}."
34
+
35
+ midis = midi_dict["midis"]
36
+ midis["uploaded_midi"] = mid
37
+ midi_dict["midis"] = midis
38
+
39
+ return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
40
+ placeholder=midi_info_text),
41
+ current_midi_state: "uploaded_midi",
42
+ midi_files_state: midi_dict}
43
+
44
+ def make_track(inpaint_steps, current_midi_name, midi_dict, max_notes, noising_strength, attack, before_release, current_instruments,
45
+ virtual_instruments_dict):
46
+
47
+ if noising_strength < 1:
48
+ print(f"Warning: making track with noising_strength = {noising_strength} < 1")
49
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
50
+ sample_steps = int(inpaint_steps)
51
+
52
+ print(f"current_instruments: {current_instruments}")
53
+ instrument_names = current_instruments
54
+ instruments_configs = {}
55
+
56
+ for virtual_instrument_name in instrument_names:
57
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
58
+
59
+ latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(
60
+ device)
61
+ sampler = virtual_instrument["sampler"]
62
+
63
+ batchsize = 1
64
+
65
+ latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
66
+
67
+ instruments_configs[virtual_instrument_name] = {
68
+ 'sample_steps': sample_steps,
69
+ 'sampler': sampler,
70
+ 'noising_strength': noising_strength,
71
+ 'latent_representation': latent_representation,
72
+ 'attack': attack,
73
+ 'before_release': before_release}
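+ # One sampling config per selected instrument; DiffSynth uses these to render the MIDI, presumably pairing each track with its assigned instrument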
74
+
75
+ diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
76
+
77
+ midis = midi_dict["midis"]
78
+ mid = midis[current_midi_name]
79
+ full_audio = diffSynth.get_music(mid, instrument_names, max_notes=max_notes)
80
+
81
+ return {track_audio: (sample_rate, full_audio)}
82
+
83
+ with gr.Tab("Arrangement"):
84
+ default_instrument = "preset_string"
85
+ current_instruments_state = gr.State(value=[default_instrument for _ in range(100)])
86
+ current_midi_state = gr.State(value="Ode_to_Joy_Easy_variation")
87
+
88
+ gr.Markdown("Make music with generated sounds!")
89
+ with gr.Row(variant="panel"):
90
+ with gr.Column(scale=3):
91
+
92
+ @gr.render(inputs=midi_files_state)
93
+ def check_midis(midi_dict):
94
+ midis = midi_dict["midis"]
95
+ midi_names = list(midis.keys())
96
+
97
+ instrument_dropdown = gr.Dropdown(
98
+ midi_names, label="Select from preset midi files", value="Ode_to_Joy_Easy_variation"
99
+ )
100
+
101
+ def select_midi(midi_name):
102
+ # print(f"midi_name: {midi_name}")
103
+ mid = midis[midi_name]
104
+ tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
105
+ midi_info_text = f"Name: {midi_name}"
106
+ for i, track in enumerate(tracks):
107
+ midi_info_text += f"\n{len(track.events)} events loaded from Track {i}."
108
+
109
+ return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
110
+ placeholder=midi_info_text),
111
+ current_midi_state: midi_name}
112
+
113
+ instrument_dropdown.select(select_midi, inputs=instrument_dropdown,
114
+ outputs=[midi_info_textbox, current_midi_state])
115
+
116
+ midi_file = gr.File(label="Upload a midi file", type="binary", scale=1)
117
+ midi_info_textbox = gr.Textbox(label="Midi info", lines=10,
118
+ placeholder="Please select/upload a midi on the left.", scale=3,
119
+ visible=False)
120
+
121
+ with gr.Column(scale=3, ):
122
+
123
+ @gr.render(inputs=[current_midi_state, midi_files_state, virtual_instruments_state])
124
+ def render_select_instruments(current_midi_name, midi_dict, virtual_instruments_dict):
125
+
126
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
127
+ instrument_names = list(virtual_instruments.keys())
128
+
129
+ midis = midi_dict["midis"]
130
+ mid = midis[current_midi_name]
131
+ tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
132
+
133
+ dropdowns = []
134
+ for i, track in enumerate(tracks):
135
+ dropdowns.append(gr.Dropdown(
136
+ instrument_names, value=default_instrument, label=f"Track {i}: {len(track.events)} notes",
137
+ info=f"Select an instrument to play this track!"
138
+ ))
139
+
140
+ def select_instruments(*instruments):
141
+ return instruments
142
+
143
+ for d in dropdowns:
144
+ d.select(select_instruments, inputs=dropdowns,
145
+ outputs=current_instruments_state)
146
+
147
+
148
+ with gr.Column(scale=3):
149
+ max_notes_slider = gr.Slider(minimum=10.0, maximum=999.0, value=100.0, step=1.0,
150
+ label="Maximum number of synthesized notes in each track",
151
+ info="Lower this value to prevent Gradio timeouts")
152
+ make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
153
+ track_audio = gr.Audio(type="numpy", label="Play music", interactive=False)
154
+
155
+ with gr.Row(variant="panel", visible=False):
156
+ with gr.Tab("Origin sound"):
157
+ inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
158
+ label="inpaint_steps")
159
+ noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
160
+ end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
161
+ label="end_noise_level_ratio")
162
+ attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
163
+ before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
164
+ label="before_release in sec")
165
+ release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
166
+ mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
167
+ label="mask_flexivity")
168
+ with gr.Tab("Length adjustment config"):
169
+ use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
170
+ test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
171
+ test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
172
+ test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
173
+ duration_slider = gradioWebUI.get_duration_slider()
174
+ with gr.Tab("Pitch shift config"):
175
+ pitch_shift_radio = gr.Radio(choices=["librosa", "torchaudio", "rubberband"],
176
+ value="librosa")
177
+
178
+ with gr.Row(variant="panel", visible=False):
179
+ with gr.Column(scale=2):
180
+ with gr.Row(variant="panel"):
181
+ source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
182
+ height=600, scale=1)
183
+ source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
184
+ height=600, scale=1)
185
+
186
+ make_track_button.click(make_track,
187
+ inputs=[inpaint_steps_slider, current_midi_state, midi_files_state,
188
+ max_notes_slider, noising_strength_slider,
189
+ attack_slider,
190
+ before_release_slider,
191
+ current_instruments_state,
192
+ virtual_instruments_state],
193
+ outputs=[track_audio])
194
+
195
+ midi_file.change(read_midi,
196
+ inputs=[midi_file,
197
+ midi_files_state],
198
+ outputs=[midi_info_textbox,
199
+ current_midi_state,
200
+ midi_files_state])
webUI/natural_language_guided_4/rec.py ADDED
1
+ import gradio as gr
2
+
3
+ from data_generation.nsynth import get_nsynth_dataloader
4
+ from webUI.natural_language_guided_STFT.utils import encodeBatch2GradioOutput_STFT, InputBatch2Encode_STFT, \
5
+ latent_representation_to_Gradio_image
6
+
7
+
8
+ def get_recSTFT_module(gradioWebUI, reconstruction_state):
9
+ # Load configurations
10
+ uNet = gradioWebUI.uNet
11
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
12
+ VAE_scale = gradioWebUI.VAE_scale
13
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
14
+
15
+ timesteps = gradioWebUI.timesteps
16
+ VAE_quantizer = gradioWebUI.VAE_quantizer
17
+ VAE_encoder = gradioWebUI.VAE_encoder
18
+ VAE_decoder = gradioWebUI.VAE_decoder
19
+ CLAP = gradioWebUI.CLAP
20
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
21
+ device = gradioWebUI.device
22
+ squared = gradioWebUI.squared
23
+ sample_rate = gradioWebUI.sample_rate
24
+ noise_strategy = gradioWebUI.noise_strategy
25
+
26
+ def generate_reconstruction_samples(sample_source, batchsize_slider, encodeCache,
27
+ reconstruction_samples):
28
+
29
+ vae_batchsize = int(batchsize_slider)
30
+
31
+ if sample_source == "text2sound_trainSTFT":
32
+ training_dataset_path = f'data/NSynth/nsynth-STFT-train-52.hdf5' # Make sure to use your actual path
33
+ iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
34
+ get_latent_representation=False, with_meta_data=False,
35
+ task="STFT")
36
+ elif sample_source == "text2sound_validSTFT":
37
+ training_dataset_path = f'data/NSynth/nsynth-STFT-valid-52.hdf5' # Make sure to use your actual path
38
+ iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
39
+ get_latent_representation=False, with_meta_data=False,
40
+ task="STFT")
41
+ elif sample_source == "text2sound_testSTFT":
42
+ training_dataset_path = f'data/NSynth/nsynth-STFT-test-52.hdf5' # Make sure to use your actual path
43
+ iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
44
+ get_latent_representation=False, with_meta_data=False,
45
+ task="STFT")
46
+ else:
47
+ raise NotImplementedError()
48
+
49
+ spectrogram_batch = next(iter(iterator))
50
+
51
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, latent_representations, quantized_latent_representations = InputBatch2Encode_STFT(
52
+ VAE_encoder, spectrogram_batch, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer, squared=squared)
53
+
54
+ latent_representation_gradio_images, quantized_latent_representation_gradio_images = [], []
55
+ for i in range(vae_batchsize):
56
+ latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
57
+ quantized_latent_representation_gradio_images.append(
58
+ latent_representation_to_Gradio_image(quantized_latent_representations[i]))
59
+
60
+ if quantized_latent_representations is None:
61
+ quantized_latent_representations = latent_representations
62
+ reconstruction_flipped_log_spectrums, reconstruction_flipped_phases, reconstruction_signals, reconstruction_flipped_log_spectrums_WOA, reconstruction_flipped_phases_WOA, reconstruction_signals_WOA = encodeBatch2GradioOutput_STFT(VAE_decoder,
63
+ quantized_latent_representations,
64
+ resolution=(
65
+ 512,
66
+ width * VAE_scale),
67
+ original_STFT_batch=spectrogram_batch
68
+ )
69
+
70
+ reconstruction_samples["origin_flipped_log_spectrums"] = origin_flipped_log_spectrums
71
+ reconstruction_samples["origin_flipped_phases"] = origin_flipped_phases
72
+ reconstruction_samples["origin_signals"] = origin_signals
73
+ reconstruction_samples["latent_representation_gradio_images"] = latent_representation_gradio_images
74
+ reconstruction_samples[
75
+ "quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
76
+ reconstruction_samples[
77
+ "reconstruction_flipped_log_spectrums"] = reconstruction_flipped_log_spectrums
78
+ reconstruction_samples[
79
+ "reconstruction_flipped_phases"] = reconstruction_flipped_phases
80
+ reconstruction_samples["reconstruction_signals"] = reconstruction_signals
81
+ reconstruction_samples[
82
+ "reconstruction_flipped_log_spectrums_WOA"] = reconstruction_flipped_log_spectrums_WOA
83
+ reconstruction_samples[
84
+ "reconstruction_flipped_phases_WOA"] = reconstruction_flipped_phases_WOA
85
+ reconstruction_samples["reconstruction_signals_WOA"] = reconstruction_signals_WOA
86
+ reconstruction_samples["sampleRate"] = sample_rate
87
+
88
+ latent_representation_gradio_image = reconstruction_samples["latent_representation_gradio_images"][0]
89
+ quantized_latent_representation_gradio_image = \
90
+ reconstruction_samples["quantized_latent_representation_gradio_images"][0]
91
+ origin_flipped_log_spectrum = reconstruction_samples["origin_flipped_log_spectrums"][0]
92
+ origin_flipped_phase = reconstruction_samples["origin_flipped_phases"][0]
93
+ origin_signal = reconstruction_samples["origin_signals"][0]
94
+ reconstruction_flipped_log_spectrum = reconstruction_samples["reconstruction_flipped_log_spectrums"][0]
95
+ reconstruction_flipped_phase = reconstruction_samples["reconstruction_flipped_phases"][0]
96
+ reconstruction_signal = reconstruction_samples["reconstruction_signals"][0]
97
+ reconstruction_flipped_log_spectrum_WOA = reconstruction_samples["reconstruction_flipped_log_spectrums_WOA"][0]
98
+ reconstruction_flipped_phase_WOA = reconstruction_samples["reconstruction_flipped_phases_WOA"][0]
99
+ reconstruction_signal_WOA = reconstruction_samples["reconstruction_signals_WOA"][0]
100
+
101
+ return {origin_amplitude_image_output: origin_flipped_log_spectrum,
102
+ origin_phase_image_output: origin_flipped_phase,
103
+ origin_audio_output: (sample_rate, origin_signal),
104
+ latent_representation_image_output: latent_representation_gradio_image,
105
+ quantized_latent_representation_image_output: quantized_latent_representation_gradio_image,
106
+ reconstruction_amplitude_image_output: reconstruction_flipped_log_spectrum,
107
+ reconstruction_phase_image_output: reconstruction_flipped_phase,
108
+ reconstruction_audio_output: (sample_rate, reconstruction_signal),
109
+ reconstruction_amplitude_image_output_WOA: reconstruction_flipped_log_spectrum_WOA,
110
+ reconstruction_phase_image_output_WOA: reconstruction_flipped_phase_WOA,
111
+ reconstruction_audio_output_WOA: (sample_rate, reconstruction_signal_WOA),
112
+ sample_index_slider: gr.update(minimum=0, maximum=vae_batchsize - 1, value=0, step=1.0,
113
+ label="Sample index.",
114
+ info="Slide to view other samples", scale=1, visible=True),
115
+ reconstruction_state: encodeCache,
116
+ reconstruction_samples_state: reconstruction_samples}
117
+
118
+ def show_reconstruction_sample(sample_index, encodeCache_state, reconstruction_samples_state):
119
+ sample_index = int(sample_index)
120
+ sampleRate = reconstruction_samples_state["sampleRate"]
121
+ latent_representation_gradio_image = reconstruction_samples_state["latent_representation_gradio_images"][
122
+ sample_index]
123
+ quantized_latent_representation_gradio_image = \
124
+ reconstruction_samples_state["quantized_latent_representation_gradio_images"][sample_index]
125
+ origin_flipped_log_spectrum = reconstruction_samples_state["origin_flipped_log_spectrums"][sample_index]
126
+ origin_flipped_phase = reconstruction_samples_state["origin_flipped_phases"][sample_index]
127
+ origin_signal = reconstruction_samples_state["origin_signals"][sample_index]
128
+ reconstruction_flipped_log_spectrum = reconstruction_samples_state["reconstruction_flipped_log_spectrums"][
129
+ sample_index]
130
+ reconstruction_flipped_phase = reconstruction_samples_state["reconstruction_flipped_phases"][
131
+ sample_index]
132
+ reconstruction_signal = reconstruction_samples_state["reconstruction_signals"][sample_index]
133
+ reconstruction_flipped_log_spectrum_WOA = reconstruction_samples_state["reconstruction_flipped_log_spectrums_WOA"][
134
+ sample_index]
135
+ reconstruction_flipped_phase_WOA = reconstruction_samples_state["reconstruction_flipped_phases_WOA"][
136
+ sample_index]
137
+ reconstruction_signal_WOA = reconstruction_samples_state["reconstruction_signals_WOA"][sample_index]
138
+ return origin_flipped_log_spectrum, origin_flipped_phase, (sampleRate, origin_signal), \
139
+ latent_representation_gradio_image, quantized_latent_representation_gradio_image, \
140
+ reconstruction_flipped_log_spectrum, reconstruction_flipped_phase, (sampleRate, reconstruction_signal), \
141
+ reconstruction_flipped_log_spectrum_WOA, reconstruction_flipped_phase_WOA, (sampleRate, reconstruction_signal_WOA), \
142
+ encodeCache_state, reconstruction_samples_state
143
+
144
+ with gr.Tab("Reconstruction"):
145
+ reconstruction_samples_state = gr.State(value={})
146
+ gr.Markdown("Test reconstruction.")
147
+ with gr.Row(variant="panel"):
148
+ with gr.Column():
149
+ sample_source_radio = gr.Radio(
150
+ choices=["synthetic", "external", "text2sound_trainSTFT", "text2sound_testSTFT", "text2sound_validSTFT"],
151
+ value="text2sound_trainf", info="Info placeholder", scale=2)
152
+ batchsize_slider = gr.Slider(minimum=1., maximum=16., value=4., step=1.,
153
+ label="batchsize")
154
+ with gr.Column():
155
+ generate_button = gr.Button(variant="primary", value="Generate reconstruction samples", scale=1)
156
+ sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, label="Sample index.",
157
+ info="Slide to view other samples", scale=1, visible=False)
158
+ with gr.Row(variant="panel"):
159
+ with gr.Column():
160
+ origin_amplitude_image_output = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
161
+ origin_phase_image_output = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
162
+ origin_audio_output = gr.Audio(type="numpy", label="Play the example!")
163
+ with gr.Column():
164
+ reconstruction_amplitude_image_output = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
165
+ reconstruction_phase_image_output = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
166
+ reconstruction_audio_output = gr.Audio(type="numpy", label="Play the example!")
167
+ with gr.Column():
168
+ reconstruction_amplitude_image_output_WOA = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
169
+ reconstruction_phase_image_output_WOA = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
170
+ reconstruction_audio_output_WOA = gr.Audio(type="numpy", label="Play the example!")
171
+ with gr.Row(variant="panel", equal_height=True):
172
+ latent_representation_image_output = gr.Image(label="latent_representation", type="numpy", height=300, width=100)
173
+ quantized_latent_representation_image_output = gr.Image(label="quantized", type="numpy", height=300, width=100)
174
+
175
+ generate_button.click(generate_reconstruction_samples,
176
+ inputs=[sample_source_radio, batchsize_slider, reconstruction_state,
177
+ reconstruction_samples_state],
178
+ outputs=[origin_amplitude_image_output, origin_phase_image_output, origin_audio_output,
179
+ latent_representation_image_output, quantized_latent_representation_image_output,
180
+ reconstruction_amplitude_image_output, reconstruction_phase_image_output, reconstruction_audio_output,
181
+ reconstruction_amplitude_image_output_WOA, reconstruction_phase_image_output_WOA, reconstruction_audio_output_WOA,
182
+ sample_index_slider, reconstruction_state, reconstruction_samples_state])
183
+
184
+ sample_index_slider.change(show_reconstruction_sample,
185
+ inputs=[sample_index_slider, reconstruction_state, reconstruction_samples_state],
186
+ outputs=[origin_amplitude_image_output, origin_phase_image_output, origin_audio_output,
187
+ latent_representation_image_output, quantized_latent_representation_image_output,
188
+ reconstruction_amplitude_image_output, reconstruction_phase_image_output, reconstruction_audio_output,
189
+ reconstruction_amplitude_image_output_WOA, reconstruction_phase_image_output_WOA, reconstruction_audio_output_WOA,
190
+ reconstruction_state, reconstruction_samples_state])
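A quick aside on the callback style used throughout these tabs: a Gradio event handler may return a dictionary keyed by its output components, which is how generate_reconstruction_samples above reveals sample_index_slider through gr.update. A minimal, self-contained sketch of that pattern (the component names and values below are illustrative, not taken from this repository):

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    txt = gr.Textbox(label="Status")
    sld = gr.Slider(0, 3, step=1, visible=False, label="Sample index")

    def handler():
        # Returning a dict lets a handler update only some outputs and change
        # component properties (here: visibility and range) via gr.update.
        return {txt: "done", sld: gr.update(visible=True, maximum=7)}

    btn.click(handler, inputs=[], outputs=[txt, sld])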
webUI/natural_language_guided_4/sound2sound_with_text.py ADDED
@@ -0,0 +1,325 @@
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+
6
+ from model.DiffSynthSampler import DiffSynthSampler
7
+ from tools import pad_STFT, encode_stft
8
+ from tools import safe_int, adjust_audio_length
9
+ from webUI.natural_language_guided_4.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT, \
10
+ latent_representation_to_Gradio_image, resize_image_to_aspect_ratio, add_instrument
11
+
12
+
13
+ def get_sound2sound_with_text_module(gradioWebUI, sound2sound_with_text_state, virtual_instruments_state):
14
+ # Load configurations
15
+ uNet = gradioWebUI.uNet
16
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
17
+ VAE_scale = gradioWebUI.VAE_scale
18
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
19
+ timesteps = gradioWebUI.timesteps
20
+ VAE_encoder = gradioWebUI.VAE_encoder
21
+ VAE_quantizer = gradioWebUI.VAE_quantizer
22
+ VAE_decoder = gradioWebUI.VAE_decoder
23
+ CLAP = gradioWebUI.CLAP
24
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
25
+ device = gradioWebUI.device
26
+ squared = gradioWebUI.squared
27
+ sample_rate = gradioWebUI.sample_rate
28
+ noise_strategy = gradioWebUI.noise_strategy
29
+
30
+ def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin,
31
+ sound2sound_with_text_dict, virtual_instruments_dict):
32
+ origin_sr, origin_audio = sound2sound_origin
33
+ origin_audio = origin_audio / np.max(np.abs(origin_audio))
34
+
35
+ width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
36
+ audio_length = 256 * (VAE_scale * width - 1)
37
+ origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
38
+
39
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
40
+ padded_D = pad_STFT(D)
41
+ encoded_D = encode_stft(padded_D)
42
+
43
+ # Todo: justify batchsize to 1
44
+ origin_spectrogram_batch_tensor = torch.from_numpy(
45
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
46
+
47
+ # Todo: remove hard-coding
48
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
49
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
50
+ squared=squared)
51
+
52
+ sound2sound_with_text_dict["origin_latent_representations"] = origin_latent_representations.tolist()
53
+ sound2sound_with_text_dict[
54
+ "sound2sound_origin_latent_representation_image"] = latent_representation_to_Gradio_image(
55
+ origin_latent_representations[0]).tolist()
56
+ sound2sound_with_text_dict[
57
+ "sound2sound_origin_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
58
+ quantized_origin_latent_representations[0]).tolist()
59
+
60
+
61
+ return {sound2sound_origin_spectrogram_image: resize_image_to_aspect_ratio(origin_flipped_log_spectrums[0],
62
+ 1.55,
63
+ 1),
64
+ sound2sound_origin_phase_image: resize_image_to_aspect_ratio(origin_flipped_phases[0],
65
+ 1.55,
66
+ 1),
67
+ sound2sound_origin_latent_representation_image: latent_representation_to_Gradio_image(
68
+ origin_latent_representations[0]),
69
+ sound2sound_origin_quantized_latent_representation_image: latent_representation_to_Gradio_image(
70
+ quantized_origin_latent_representations[0]),
71
+ sound2sound_with_text_state: sound2sound_with_text_dict,
72
+ virtual_instruments_state: virtual_instruments_dict}
73
+
74
+ def sound2sound_sample(sound2sound_prompts, sound2sound_negative_prompts, sound2sound_batchsize,
75
+ sound2sound_guidance_scale, sound2sound_sampler,
76
+ sound2sound_sample_steps,
77
+ sound2sound_noising_strength, sound2sound_seed, sound2sound_dict, virtual_instruments_dict):
78
+ # input processing
79
+ sound2sound_seed = safe_int(sound2sound_seed, 12345678)
80
+ sound2sound_batchsize = int(sound2sound_batchsize)
81
+ noising_strength = sound2sound_noising_strength
82
+ sound2sound_sample_steps = int(sound2sound_sample_steps)
83
+ CFG = int(sound2sound_guidance_scale)
84
+
85
+ origin_latent_representations = torch.tensor(
86
+ sound2sound_dict["origin_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
87
+ device)
88
+
89
+ # sound2sound
90
+ text2sound_embedding = \
91
+ CLAP.get_text_features(**CLAP_tokenizer([sound2sound_prompts], padding=True, return_tensors="pt"))[0].to(
92
+ device)
93
+
94
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
95
+ negative_condition = \
96
+ CLAP.get_text_features(**CLAP_tokenizer([sound2sound_negative_prompts], padding=True, return_tensors="pt"))[
97
+ 0]
98
+ mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))
99
+
100
+ normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
101
+ mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
102
+
103
+ condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
104
+
105
+ # Todo: remove hard-coding
106
+ width = origin_latent_representations.shape[-1]
107
+ new_sound_latent_representations, initial_noise = \
108
+ mySampler.img_guided_sample(model=uNet, shape=(sound2sound_batchsize, channels, height, width),
109
+ seed=sound2sound_seed,
110
+ noising_strength=noising_strength,
111
+ guide_img=origin_latent_representations, return_tensor=True,
112
+ condition=condition,
113
+ sampler=sound2sound_sampler)
114
+
115
+ new_sound_latent_representations = new_sound_latent_representations[-1]
116
+
117
+ # Quantize new sound latent representations
118
+ quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
119
+
120
+ new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
121
+ VAE_decoder,
122
+ quantized_new_sound_latent_representations,
123
+ resolution=(
124
+ 512,
125
+ width * VAE_scale),
126
+ original_STFT_batch=None
127
+ )
128
+
129
+ new_sound_latent_representation_gradio_images = []
130
+ new_sound_quantized_latent_representation_gradio_images = []
131
+ new_sound_spectrogram_gradio_images = []
132
+ new_sound_phase_gradio_images = []
133
+ new_sound_rec_signals_gradio = []
134
+ for i in range(sound2sound_batchsize):
135
+ new_sound_latent_representation_gradio_images.append(
136
+ latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
137
+ new_sound_quantized_latent_representation_gradio_images.append(
138
+ latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
139
+ new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
140
+ new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
141
+ new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
142
+ sound2sound_dict[
143
+ "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
144
+ sound2sound_dict[
145
+ "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
146
+ sound2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
147
+ sound2sound_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
148
+ sound2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
149
+
150
+ # save instrument
151
+ sound2sound_dict["latent_representations"] = new_sound_latent_representations.to("cpu").detach().numpy()
152
+ sound2sound_dict["quantized_latent_representations"] = quantized_new_sound_latent_representations.to(
153
+ "cpu").detach().numpy()
154
+ sound2sound_dict["condition"] = condition.to("cpu").detach().numpy()
155
+ sound2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
156
+ sound2sound_dict["guidance_scale"] = CFG
157
+ sound2sound_dict["sampler"] = sound2sound_sampler
158
+
159
+ return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
160
+ new_sound_latent_representations[0]),
161
+ sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
162
+ quantized_new_sound_latent_representations[0]),
163
+ sound2sound_new_sound_spectrogram_image: resize_image_to_aspect_ratio(new_sound_flipped_log_spectrums[0],
164
+ 1.55,
165
+ 1),
166
+ sound2sound_new_sound_phase_image: resize_image_to_aspect_ratio(new_sound_flipped_phases[0],
167
+ 1.55,
168
+ 1),
169
+ sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
170
+ sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
171
+ step=1.0,
172
+ visible=True,
173
+ label="Sample index",
174
+ info="Swipe to view other samples"),
175
+ sound2sound_seed_textbox: sound2sound_seed,
176
+ sound2sound_with_text_state: sound2sound_dict,
177
+ virtual_instruments_state: virtual_instruments_dict}
178
+
179
+ def show_sound2sound_sample(sound2sound_sample_index, sound2sound_with_text_dict):
180
+ sample_index = int(sound2sound_sample_index)
181
+ return {sound2sound_new_sound_latent_representation_image:
182
+ sound2sound_with_text_dict["new_sound_latent_representation_gradio_images"][sample_index],
183
+ sound2sound_new_sound_quantized_latent_representation_image:
184
+ sound2sound_with_text_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
185
+ sound2sound_new_sound_spectrogram_image: resize_image_to_aspect_ratio(
186
+ sound2sound_with_text_dict["new_sound_spectrogram_gradio_images"][
187
+ sample_index], 1.55, 1),
188
+ sound2sound_new_sound_phase_image: resize_image_to_aspect_ratio(
189
+ sound2sound_with_text_dict["new_sound_phase_gradio_images"][
190
+ sample_index], 1.55, 1),
191
+ sound2sound_new_sound_audio: sound2sound_with_text_dict["new_sound_rec_signals_gradio"][sample_index]}
192
+
193
+ def save_virtual_instrument(sample_index, virtual_instrument_name, sound2sound_dict, virtual_instruments_dict):
194
+ virtual_instruments_dict = add_instrument(sound2sound_dict, virtual_instruments_dict, virtual_instrument_name,
195
+ sample_index)
196
+
197
+ return {virtual_instruments_state: virtual_instruments_dict,
198
+ text2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
199
+ placeholder=f"Saved as {virtual_instrument_name}!")}
200
+
201
+ with gr.Tab("Sound2Sound"):
202
+ gr.Markdown("Generate new sound based on a given sound!")
203
+ with gr.Row(variant="panel"):
204
+ with gr.Column(scale=3):
205
+ sound2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
206
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
207
+
208
+ with gr.Column(scale=1):
209
+ sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
210
+
211
+ sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
212
+ label="Sample index",
213
+ info="Swipe to view other samples")
214
+
215
+ with gr.Row(variant="panel"):
216
+ with gr.Column(scale=1):
217
+ with gr.Tab("Origin sound"):
218
+ sound2sound_duration_slider = gradioWebUI.get_duration_slider()
219
+
220
+ sound2sound_origin_audio = gr.Audio(
221
+ sources=["microphone", "upload"], label="Upload/Record source sound",
222
+ waveform_options=gr.WaveformOptions(
223
+ waveform_color="#01C6FF",
224
+ waveform_progress_color="#0066B4",
225
+ skip_length=1,
226
+ show_controls=False,
227
+ ),
228
+ )
229
+
230
+ with gr.Row(variant="panel"):
231
+ sound2sound_origin_spectrogram_image = gr.Image(label="Original upload spectrogram",
232
+ type="numpy",visible=True)
233
+ sound2sound_origin_phase_image = gr.Image(label="Original upload phase",
234
+ type="numpy", visible=True)
235
+
236
+ with gr.Tab("Sound2sound settings"):
237
+ sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
238
+ sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
239
+ sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
240
+ sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider()
241
+ sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
242
+ sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()
243
+
244
+ with gr.Column(scale=1):
245
+ sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False,
246
+ waveform_options=gr.WaveformOptions(
247
+ waveform_color="#FFB6C1",
248
+ waveform_progress_color="#FF0000",
249
+ skip_length=1,
250
+ show_controls=False,
251
+ ), )
252
+ with gr.Row(variant="panel"):
253
+ sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
254
+ scale=1)
255
+ sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
256
+ scale=1)
257
+
258
+ with gr.Row(variant="panel",):
259
+ text2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=2,
260
+ placeholder="Name of your instrument",
261
+ scale=1)
262
+ text2sound_save_instrument_button = gr.Button(variant="primary",
263
+ value="Save instrument",
264
+ scale=1)
265
+
266
+ with gr.Row(variant="panel"):
267
+ sound2sound_origin_latent_representation_image = gr.Image(label="Original latent representation",
268
+ type="numpy", height=800,
269
+ visible=False)
270
+ sound2sound_origin_quantized_latent_representation_image = gr.Image(
271
+ label="Original quantized latent representation", type="numpy", height=800, visible=False)
272
+
273
+ sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
274
+ type="numpy", height=800, visible=False)
275
+ sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
276
+ label="New sound quantized latent representation", type="numpy", height=800, visible=False)
277
+
278
+ sound2sound_origin_audio.change(receive_upload_origin_audio,
279
+ inputs=[sound2sound_duration_slider,
280
+ sound2sound_origin_audio,
281
+ sound2sound_with_text_state,
282
+ virtual_instruments_state],
283
+ outputs=[sound2sound_origin_spectrogram_image,
284
+ sound2sound_origin_phase_image,
285
+ sound2sound_origin_latent_representation_image,
286
+ sound2sound_origin_quantized_latent_representation_image,
287
+ sound2sound_with_text_state,
288
+ virtual_instruments_state])
289
+
290
+ sound2sound_sample_button.click(sound2sound_sample,
291
+ inputs=[sound2sound_prompts_textbox,
292
+ text2sound_negative_prompts_textbox,
293
+ sound2sound_batchsize_slider,
294
+ sound2sound_guidance_scale_slider,
295
+ sound2sound_sampler_radio,
296
+ sound2sound_sample_steps_slider,
297
+ sound2sound_noising_strength_slider,
298
+ sound2sound_seed_textbox,
299
+ sound2sound_with_text_state,
300
+ virtual_instruments_state],
301
+ outputs=[sound2sound_new_sound_latent_representation_image,
302
+ sound2sound_new_sound_quantized_latent_representation_image,
303
+ sound2sound_new_sound_spectrogram_image,
304
+ sound2sound_new_sound_phase_image,
305
+ sound2sound_new_sound_audio,
306
+ sound2sound_sample_index_slider,
307
+ sound2sound_seed_textbox,
308
+ sound2sound_with_text_state,
309
+ virtual_instruments_state])
310
+
311
+ text2sound_save_instrument_button.click(save_virtual_instrument,
312
+ inputs=[sound2sound_sample_index_slider,
313
+ text2sound_instrument_name_textbox,
314
+ sound2sound_with_text_state,
315
+ virtual_instruments_state],
316
+ outputs=[virtual_instruments_state,
317
+ text2sound_instrument_name_textbox])
318
+
319
+ sound2sound_sample_index_slider.change(show_sound2sound_sample,
320
+ inputs=[sound2sound_sample_index_slider, sound2sound_with_text_state],
321
+ outputs=[sound2sound_new_sound_latent_representation_image,
322
+ sound2sound_new_sound_quantized_latent_representation_image,
323
+ sound2sound_new_sound_spectrogram_image,
324
+ sound2sound_new_sound_phase_image,
325
+ sound2sound_new_sound_audio])
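For orientation, a minimal sketch of how a module factory such as get_sound2sound_with_text_module is typically mounted in a Gradio Blocks app. The build_demo wrapper, the gradio_web_ui argument name, and the empty initial state values are illustrative assumptions, not part of this commit:

import gradio as gr

from webUI.natural_language_guided_4.sound2sound_with_text import get_sound2sound_with_text_module


def build_demo(gradio_web_ui):
    # gradio_web_ui is assumed to expose the attributes the factory reads
    # (uNet, VAE_encoder/quantizer/decoder, CLAP, device, sample_rate, ...)
    # as well as the get_*_slider / get_*_radio helpers used for the settings tab.
    with gr.Blocks() as demo:
        sound2sound_state = gr.State(value={})          # per-session cache written by the callbacks
        virtual_instruments_state = gr.State(value={})  # filled when instruments are saved
        get_sound2sound_with_text_module(gradio_web_ui, sound2sound_state, virtual_instruments_state)
    return demo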
webUI/natural_language_guided_4/super_resolution_with_text.py ADDED
@@ -0,0 +1,387 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ from scipy.ndimage import zoom
6
+
7
+ from model.DiffSynthSampler import DiffSynthSampler
8
+ from tools import adjust_audio_length, rescale, safe_int, pad_STFT, encode_stft
9
+ from webUI.natural_language_guided_STFT.utils import latent_representation_to_Gradio_image
10
+ from webUI.natural_language_guided_STFT.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT
11
+
12
+
13
+ def get_super_resolution_with_text_module(gradioWebUI, inpaintWithText_state):
14
+ # Load configurations
15
+ uNet = gradioWebUI.uNet
16
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
17
+ VAE_scale = gradioWebUI.VAE_scale
18
+ height, width, channels = int(freq_resolution/VAE_scale), int(time_resolution/VAE_scale), gradioWebUI.channels
19
+ timesteps = gradioWebUI.timesteps
20
+ VAE_encoder = gradioWebUI.VAE_encoder
21
+ VAE_quantizer = gradioWebUI.VAE_quantizer
22
+ VAE_decoder = gradioWebUI.VAE_decoder
23
+ CLAP = gradioWebUI.CLAP
24
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
25
+ device = gradioWebUI.device
26
+ squared = gradioWebUI.squared
27
+ sample_rate = gradioWebUI.sample_rate
28
+ noise_strategy = gradioWebUI.noise_strategy
29
+
30
+ def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin_source, sound2sound_origin_upload, sound2sound_origin_microphone,
31
+ inpaintWithText_dict):
32
+
33
+ if sound2sound_origin_source == "upload":
34
+ origin_sr, origin_audio = sound2sound_origin_upload
35
+ else:
36
+ origin_sr, origin_audio = sound2sound_origin_microphone
37
+
38
+ origin_audio = origin_audio / np.max(np.abs(origin_audio))
39
+
40
+ width = int(time_resolution*((sound2sound_duration+1)/4) / VAE_scale)
41
+ audio_length = 256 * (VAE_scale * width - 1)
42
+ origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
43
+
44
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
45
+ padded_D = pad_STFT(D)
46
+ encoded_D = encode_stft(padded_D)
47
+
48
+ # Todo: justify batchsize to 1
49
+ origin_spectrogram_batch_tensor = torch.from_numpy(
50
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
51
+
52
+ # Todo: remove hard-coding
53
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
54
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer, squared=squared)
55
+
56
+ if sound2sound_origin_source == "upload":
57
+ inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
58
+ inpaintWithText_dict[
59
+ "sound2sound_origin_upload_latent_representation_image"] = latent_representation_to_Gradio_image(
60
+ origin_latent_representations[0]).tolist()
61
+ inpaintWithText_dict[
62
+ "sound2sound_origin_upload_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
63
+ quantized_origin_latent_representations[0]).tolist()
64
+ return {sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
65
+ sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
66
+ sound2sound_origin_spectrogram_microphone_image: gr.update(),
67
+ sound2sound_origin_phase_microphone_image: gr.update(),
68
+ sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
69
+ origin_latent_representations[0]),
70
+ sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
71
+ quantized_origin_latent_representations[0]),
72
+ sound2sound_origin_microphone_latent_representation_image: gr.update(),
73
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
74
+ inpaintWithText_state: inpaintWithText_dict}
75
+ else:
76
+ inpaintWithText_dict["origin_microphone_latent_representations"] = origin_latent_representations.tolist()
77
+ inpaintWithText_dict[
78
+ "sound2sound_origin_microphone_latent_representation_image"] = latent_representation_to_Gradio_image(
79
+ origin_latent_representations[0]).tolist()
80
+ inpaintWithText_dict[
81
+ "sound2sound_origin_microphone_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
82
+ quantized_origin_latent_representations[0]).tolist()
83
+ return {sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
84
+ sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
85
+ sound2sound_origin_spectrogram_microphone_image: gr.update(),
86
+ sound2sound_origin_phase_microphone_image: gr.update(),
87
+ sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
88
+ origin_latent_representations[0]),
89
+ sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
90
+ quantized_origin_latent_representations[0]),
91
+ sound2sound_origin_microphone_latent_representation_image: gr.update(),
92
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
93
+ inpaintWithText_state: inpaintWithText_dict}
94
+
95
+ def sound2sound_sample(sound2sound_origin_spectrogram_upload, sound2sound_origin_spectrogram_microphone,
96
+ text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
97
+ sound2sound_guidance_scale, sound2sound_sampler,
98
+ sound2sound_sample_steps, sound2sound_origin_source,
99
+ sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area, inpaintWithText_dict
100
+ ):
101
+
102
+ # input preprocessing
103
+ sound2sound_seed = safe_int(sound2sound_seed, 12345678)
104
+ sound2sound_batchsize = int(sound2sound_batchsize)
105
+ noising_strength = sound2sound_noising_strength
106
+ sound2sound_sample_steps = int(sound2sound_sample_steps)
107
+ CFG = int(sound2sound_guidance_scale)
108
+
109
+ text2sound_embedding = \
110
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(device)
111
+
112
+ if sound2sound_origin_source == "upload":
113
+ origin_latent_representations = torch.tensor(
114
+ inpaintWithText_dict["origin_upload_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
115
+ device)
116
+ elif sound2sound_origin_source == "microphone":
117
+ origin_latent_representations = torch.tensor(
118
+ inpaintWithText_dict["origin_microphone_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
119
+ device)
120
+ else:
121
+ print("Input source not in ['upload', 'microphone']!")
122
+ raise NotImplementedError()
123
+
124
+ high_resolution_latent_representations = torch.zeros((sound2sound_batchsize, channels, 256, 64)).to(device)
125
+ high_resolution_latent_representations[:, :, :128, :] = origin_latent_representations
126
+ latent_mask = np.ones((256, 64))
127
+ latent_mask[192:, :] = 0.0
128
+ print(f"latent_mask mean: {np.mean(latent_mask)}")
129
+
130
+ if sound2sound_inpaint_area == "inpaint masked":
131
+ latent_mask = 1 - latent_mask
132
+ latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(sound2sound_batchsize, channels, 1,
133
+ 1).float().to(device)
134
+ latent_mask = torch.flip(latent_mask, [2])
135
+
136
+ mySampler = DiffSynthSampler(timesteps, height=height*2, channels=channels, noise_strategy=noise_strategy)
137
+ unconditional_condition = \
138
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
139
+ mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))
140
+
141
+ normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
142
+
143
+ mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
144
+
145
+ # Todo: remove hard-coding
146
+ width = high_resolution_latent_representations.shape[-1]
147
+ condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
148
+
149
+ new_sound_latent_representations, initial_noise = \
150
+ mySampler.inpaint_sample(model=uNet, shape=(sound2sound_batchsize, channels, height*2, width),
151
+ seed=sound2sound_seed,
152
+ noising_strength=noising_strength,
153
+ guide_img=high_resolution_latent_representations, mask=latent_mask, return_tensor=True,
154
+ condition=condition, sampler=sound2sound_sampler)
155
+
156
+ new_sound_latent_representations = new_sound_latent_representations[-1]
157
+
158
+ # Quantize new sound latent representations
159
+ quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
160
+ new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
161
+ quantized_new_sound_latent_representations,
162
+ resolution=(
163
+ 1024,
164
+ width * VAE_scale),
165
+ original_STFT_batch=None
166
+ )
167
+
168
+ new_sound_latent_representation_gradio_images = []
169
+ new_sound_quantized_latent_representation_gradio_images = []
170
+ new_sound_spectrogram_gradio_images = []
171
+ new_sound_phase_gradio_images = []
172
+ new_sound_rec_signals_gradio = []
173
+ for i in range(sound2sound_batchsize):
174
+ new_sound_latent_representation_gradio_images.append(
175
+ latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
176
+ new_sound_quantized_latent_representation_gradio_images.append(
177
+ latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
178
+ new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
179
+ new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
180
+ new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
181
+
182
+ inpaintWithText_dict[
183
+ "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
184
+ inpaintWithText_dict[
185
+ "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
186
+ inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
187
+ inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
188
+ inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
189
+
190
+ return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
191
+ new_sound_latent_representations[0]),
192
+ sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
193
+ quantized_new_sound_latent_representations[0]),
194
+ sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
195
+ sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
196
+ sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
197
+ sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
198
+ step=1.0,
199
+ visible=True,
200
+ label="Sample index",
201
+ info="Swipe to view other samples"),
202
+ sound2sound_seed_textbox: sound2sound_seed,
203
+ inpaintWithText_state: inpaintWithText_dict}
204
+
205
+ def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
206
+ sample_index = int(sound2sound_sample_index)
207
+ return {sound2sound_new_sound_latent_representation_image:
208
+ inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
209
+ sound2sound_new_sound_quantized_latent_representation_image:
210
+ inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
211
+ sound2sound_new_sound_spectrogram_image: inpaintWithText_dict["new_sound_spectrogram_gradio_images"][
212
+ sample_index],
213
+ sound2sound_new_sound_phase_image: inpaintWithText_dict["new_sound_phase_gradio_images"][
214
+ sample_index],
215
+ sound2sound_new_sound_audio: inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index]}
216
+
217
+ def sound2sound_switch_origin_source(sound2sound_origin_source):
218
+
219
+ if sound2sound_origin_source == "upload":
220
+ return {sound2sound_origin_upload_audio: gr.update(visible=True),
221
+ sound2sound_origin_microphone_audio: gr.update(visible=False),
222
+ sound2sound_origin_spectrogram_upload_image: gr.update(visible=True),
223
+ sound2sound_origin_phase_upload_image: gr.update(visible=True),
224
+ sound2sound_origin_spectrogram_microphone_image: gr.update(visible=False),
225
+ sound2sound_origin_phase_microphone_image: gr.update(visible=False),
226
+ sound2sound_origin_upload_latent_representation_image: gr.update(visible=True),
227
+ sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=True),
228
+ sound2sound_origin_microphone_latent_representation_image: gr.update(visible=False),
229
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=False)}
230
+ elif sound2sound_origin_source == "microphone":
231
+ return {sound2sound_origin_upload_audio: gr.update(visible=False),
232
+ sound2sound_origin_microphone_audio: gr.update(visible=True),
233
+ sound2sound_origin_spectrogram_upload_image: gr.update(visible=False),
234
+ sound2sound_origin_phase_upload_image: gr.update(visible=False),
235
+ sound2sound_origin_spectrogram_microphone_image: gr.update(visible=True),
236
+ sound2sound_origin_phase_microphone_image: gr.update(visible=True),
237
+ sound2sound_origin_upload_latent_representation_image: gr.update(visible=False),
238
+ sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=False),
239
+ sound2sound_origin_microphone_latent_representation_image: gr.update(visible=True),
240
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=True)}
241
+ else:
242
+ print("Input source not in ['upload', 'microphone']!")
243
+
244
+ with gr.Tab("Super Resolution"):
245
+ gr.Markdown("Select the area to inpaint and use the prompt to guide the synthesis of a new sound!")
246
+ with gr.Row(variant="panel"):
247
+ with gr.Column(scale=3):
248
+ text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
249
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
250
+
251
+ with gr.Column(scale=1):
252
+ sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
253
+
254
+ sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
255
+ label="Sample index",
256
+ info="Swipe to view other samples")
257
+
258
+ with gr.Row(variant="panel"):
259
+ with gr.Column(scale=1):
260
+ with gr.Tab("Origin sound"):
261
+ sound2sound_duration_slider = gradioWebUI.get_duration_slider()
262
+ sound2sound_origin_source_radio = gr.Radio(choices=["upload", "microphone"], value="upload",
263
+ label="Input source")
264
+
265
+ sound2sound_origin_upload_audio = gr.Audio(type="numpy", label="Upload", source="upload",
266
+ interactive=True, visible=True)
267
+ sound2sound_origin_microphone_audio = gr.Audio(type="numpy", label="Record", source="microphone",
268
+ interactive=True, visible=False)
269
+ with gr.Row(variant="panel"):
270
+ sound2sound_origin_spectrogram_upload_image = gr.Image(label="Original upload spectrogram",
271
+ type="numpy", height=600,
272
+ visible=True, tool="sketch")
273
+ sound2sound_origin_phase_upload_image = gr.Image(label="Original upload phase",
274
+ type="numpy", height=600,
275
+ visible=True)
276
+ sound2sound_origin_spectrogram_microphone_image = gr.Image(label="Original microphone spectrogram",
277
+ type="numpy", height=600,
278
+ visible=False, tool="sketch")
279
+ sound2sound_origin_phase_microphone_image = gr.Image(label="Original microphone phase",
280
+ type="numpy", height=600,
281
+ visible=False)
282
+ sound2sound_inpaint_area_radio = gr.Radio(choices=["inpaint masked", "inpaint not masked"],
283
+ value="inpaint masked")
284
+
285
+ with gr.Tab("Sound2sound settings"):
286
+ sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
287
+ sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
288
+ sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
289
+ sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.0)
290
+ sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
291
+ sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()
292
+
293
+
294
+ with gr.Column(scale=1):
295
+ sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
296
+ with gr.Row(variant="panel"):
297
+ sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
298
+ height=1200, scale=1)
299
+ sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
300
+ height=1200, scale=1)
301
+
302
+ with gr.Row(variant="panel"):
303
+ sound2sound_origin_upload_latent_representation_image = gr.Image(label="Original latent representation",
304
+ type="numpy", height=1200,
305
+ visible=True)
306
+ sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
307
+ label="Original quantized latent representation", type="numpy", height=1200, visible=True)
308
+
309
+ sound2sound_origin_microphone_latent_representation_image = gr.Image(label="Original latent representation",
310
+ type="numpy", height=1200,
311
+ visible=False)
312
+ sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
313
+ label="Original quantized latent representation", type="numpy", height=1200, visible=False)
314
+
315
+ sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
316
+ type="numpy", height=1200)
317
+ sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
318
+ label="New sound quantized latent representation", type="numpy", height=1200)
319
+
320
+ sound2sound_origin_upload_audio.change(receive_upload_origin_audio,
321
+ inputs=[sound2sound_duration_slider, sound2sound_origin_source_radio, sound2sound_origin_upload_audio,
322
+ sound2sound_origin_microphone_audio, inpaintWithText_state],
323
+ outputs=[sound2sound_origin_spectrogram_upload_image,
324
+ sound2sound_origin_phase_upload_image,
325
+ sound2sound_origin_spectrogram_microphone_image,
326
+ sound2sound_origin_phase_microphone_image,
327
+ sound2sound_origin_upload_latent_representation_image,
328
+ sound2sound_origin_upload_quantized_latent_representation_image,
329
+ sound2sound_origin_microphone_latent_representation_image,
330
+ sound2sound_origin_microphone_quantized_latent_representation_image,
331
+ inpaintWithText_state])
332
+ sound2sound_origin_microphone_audio.change(receive_upload_origin_audio,
333
+ inputs=[sound2sound_duration_slider, sound2sound_origin_source_radio, sound2sound_origin_upload_audio,
334
+ sound2sound_origin_microphone_audio, inpaintWithText_state],
335
+ outputs=[sound2sound_origin_spectrogram_upload_image,
336
+ sound2sound_origin_phase_upload_image,
337
+ sound2sound_origin_spectrogram_microphone_image,
338
+ sound2sound_origin_phase_microphone_image,
339
+ sound2sound_origin_upload_latent_representation_image,
340
+ sound2sound_origin_upload_quantized_latent_representation_image,
341
+ sound2sound_origin_microphone_latent_representation_image,
342
+ sound2sound_origin_microphone_quantized_latent_representation_image,
343
+ inpaintWithText_state])
344
+
345
+ sound2sound_sample_button.click(sound2sound_sample,
346
+ inputs=[sound2sound_origin_spectrogram_upload_image,
347
+ sound2sound_origin_spectrogram_microphone_image,
348
+ text2sound_prompts_textbox,
349
+ text2sound_negative_prompts_textbox,
350
+ sound2sound_batchsize_slider,
351
+ sound2sound_guidance_scale_slider,
352
+ sound2sound_sampler_radio,
353
+ sound2sound_sample_steps_slider,
354
+ sound2sound_origin_source_radio,
355
+ sound2sound_noising_strength_slider,
356
+ sound2sound_seed_textbox,
357
+ sound2sound_inpaint_area_radio,
358
+ inpaintWithText_state],
359
+ outputs=[sound2sound_new_sound_latent_representation_image,
360
+ sound2sound_new_sound_quantized_latent_representation_image,
361
+ sound2sound_new_sound_spectrogram_image,
362
+ sound2sound_new_sound_phase_image,
363
+ sound2sound_new_sound_audio,
364
+ sound2sound_sample_index_slider,
365
+ sound2sound_seed_textbox,
366
+ inpaintWithText_state])
367
+
368
+ sound2sound_sample_index_slider.change(show_sound2sound_sample,
369
+ inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
370
+ outputs=[sound2sound_new_sound_latent_representation_image,
371
+ sound2sound_new_sound_quantized_latent_representation_image,
372
+ sound2sound_new_sound_spectrogram_image,
373
+ sound2sound_new_sound_phase_image,
374
+ sound2sound_new_sound_audio])
375
+
376
+ sound2sound_origin_source_radio.change(sound2sound_switch_origin_source,
377
+ inputs=[sound2sound_origin_source_radio],
378
+ outputs=[sound2sound_origin_upload_audio,
379
+ sound2sound_origin_microphone_audio,
380
+ sound2sound_origin_spectrogram_upload_image,
381
+ sound2sound_origin_phase_upload_image,
382
+ sound2sound_origin_spectrogram_microphone_image,
383
+ sound2sound_origin_phase_microphone_image,
384
+ sound2sound_origin_upload_latent_representation_image,
385
+ sound2sound_origin_upload_quantized_latent_representation_image,
386
+ sound2sound_origin_microphone_latent_representation_image,
387
+ sound2sound_origin_microphone_quantized_latent_representation_image])
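To make the masking step in sound2sound_sample easier to follow, here is a small numpy-only sketch of how latent_mask is built before inpaint_sample is called. Which value ends up meaning "keep the guide latent" versus "synthesize here" is decided inside DiffSynthSampler.inpaint_sample, which this diff does not show, so the comments below only describe the array manipulation itself:

import numpy as np

# Per-sample latent plane used above: 256 rows x 64 columns.
latent_mask = np.ones((256, 64))
latent_mask[192:, :] = 0.0                    # zero out the top quarter of the rows
print(f"latent_mask mean: {np.mean(latent_mask)}")   # 0.75

inpaint_area = "inpaint masked"               # the radio's default value
if inpaint_area == "inpaint masked":
    latent_mask = 1 - latent_mask             # swap which region carries ones and which carries zeros
latent_mask = np.flip(latent_mask, axis=0)    # numpy counterpart of the torch.flip(latent_mask, [2]) step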
webUI/natural_language_guided_4/text2sound.py ADDED
@@ -0,0 +1,220 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+
4
+ from model.DiffSynthSampler import DiffSynthSampler
5
+ from tools import safe_int
6
+ from webUI.natural_language_guided_4.utils import latent_representation_to_Gradio_image, \
7
+ encodeBatch2GradioOutput_STFT, add_instrument, resize_image_to_aspect_ratio
8
+
9
+
10
+ def get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state):
11
+ # Load configurations
12
+ uNet = gradioWebUI.uNet
13
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
14
+ VAE_scale = gradioWebUI.VAE_scale
15
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
16
+
17
+ timesteps = gradioWebUI.timesteps
18
+ VAE_quantizer = gradioWebUI.VAE_quantizer
19
+ VAE_decoder = gradioWebUI.VAE_decoder
20
+ CLAP = gradioWebUI.CLAP
21
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
22
+ device = gradioWebUI.device
23
+ squared = gradioWebUI.squared
24
+ sample_rate = gradioWebUI.sample_rate
25
+ noise_strategy = gradioWebUI.noise_strategy
26
+
27
+ def diffusion_random_sample(text2sound_prompts, text2sound_negative_prompts, text2sound_batchsize,
28
+ text2sound_duration,
29
+ text2sound_guidance_scale, text2sound_sampler,
30
+ text2sound_sample_steps, text2sound_seed,
31
+ text2sound_dict):
32
+ text2sound_sample_steps = int(text2sound_sample_steps)
33
+ text2sound_seed = safe_int(text2sound_seed, 12345678)
34
+
35
+ width = int(time_resolution * ((text2sound_duration + 1) / 4) / VAE_scale)
36
+
37
+ text2sound_batchsize = int(text2sound_batchsize)
38
+
39
+ text2sound_embedding = \
40
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
41
+ device)
42
+
43
+ CFG = int(text2sound_guidance_scale)
44
+
45
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
46
+ negative_condition = \
47
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
48
+ 0]
49
+
50
+ mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))
51
+
52
+ mySampler.respace(list(np.linspace(0, timesteps - 1, text2sound_sample_steps, dtype=np.int32)))
53
+
54
+ condition = text2sound_embedding.repeat(text2sound_batchsize, 1)
55
+
56
+ latent_representations, initial_noise = \
57
+ mySampler.sample(model=uNet, shape=(text2sound_batchsize, channels, height, width), seed=text2sound_seed,
58
+ return_tensor=True, condition=condition, sampler=text2sound_sampler)
59
+
60
+ latent_representations = latent_representations[-1]
61
+
62
+ latent_representation_gradio_images = []
63
+ quantized_latent_representation_gradio_images = []
64
+ new_sound_spectrogram_gradio_images = []
65
+ new_sound_phase_gradio_images = []
66
+ new_sound_rec_signals_gradio = []
67
+
68
+ quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
69
+ # Todo: remove hard-coding
70
+ flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
71
+ quantized_latent_representations,
72
+ resolution=(
73
+ 512,
74
+ width * VAE_scale),
75
+ original_STFT_batch=None
76
+ )
77
+
78
+ for i in range(text2sound_batchsize):
79
+ latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
80
+ quantized_latent_representation_gradio_images.append(
81
+ latent_representation_to_Gradio_image(quantized_latent_representations[i]))
82
+ new_sound_spectrogram_gradio_images.append(flipped_log_spectrums[i])
83
+ new_sound_phase_gradio_images.append(flipped_phases[i])
84
+ new_sound_rec_signals_gradio.append((sample_rate, rec_signals[i]))
85
+
86
+ text2sound_dict["latent_representation_gradio_images"] = latent_representation_gradio_images
87
+ text2sound_dict["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
88
+ text2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
89
+ text2sound_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
90
+ text2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
91
+
92
+ # save instrument
93
+ text2sound_dict["latent_representations"] = latent_representations.to("cpu").detach().numpy()
94
+ text2sound_dict["quantized_latent_representations"] = quantized_latent_representations.to(
95
+ "cpu").detach().numpy()
96
+ text2sound_dict["condition"] = condition.to("cpu").detach().numpy()
97
+ text2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
98
+ text2sound_dict["guidance_scale"] = CFG
99
+ text2sound_dict["sampler"] = text2sound_sampler
100
+
101
+ return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][0],
102
+ text2sound_quantized_latent_representation_image:
103
+ text2sound_dict["quantized_latent_representation_gradio_images"][0],
104
+ text2sound_sampled_spectrogram_image: resize_image_to_aspect_ratio(
105
+ text2sound_dict["new_sound_spectrogram_gradio_images"][0],
106
+ 1.55,
107
+ 1),
108
+ text2sound_sampled_phase_image: resize_image_to_aspect_ratio(
109
+ text2sound_dict["new_sound_phase_gradio_images"][0],
110
+ 1.55,
111
+ 1),
112
+ text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][0],
113
+ text2sound_seed_textbox: text2sound_seed,
114
+ text2sound_state: text2sound_dict,
115
+ text2sound_sample_index_slider: gr.update(minimum=0, maximum=text2sound_batchsize - 1, value=0, step=1,
116
+ visible=True,
117
+ label="Sample index.",
118
+ info="Swipe to view other samples")}
119
+
120
+ def show_random_sample(sample_index, text2sound_dict):
121
+ sample_index = int(sample_index)
122
+ text2sound_dict["sample_index"] = sample_index
123
+ print(text2sound_dict["new_sound_rec_signals_gradio"][sample_index])
124
+ return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][
125
+ sample_index],
126
+ text2sound_quantized_latent_representation_image:
127
+ text2sound_dict["quantized_latent_representation_gradio_images"][sample_index],
128
+ text2sound_sampled_spectrogram_image: resize_image_to_aspect_ratio(
129
+ text2sound_dict["new_sound_spectrogram_gradio_images"][sample_index], 1.55, 1),
130
+ text2sound_sampled_phase_image: resize_image_to_aspect_ratio(text2sound_dict["new_sound_phase_gradio_images"][
131
+ sample_index], 1.55, 1),
132
+ text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][sample_index]}
133
+
134
+ def save_virtual_instrument(sample_index, virtual_instrument_name, text2sound_dict, virtual_instruments_dict):
135
+ virtual_instruments_dict = add_instrument(text2sound_dict, virtual_instruments_dict, virtual_instrument_name,
136
+ sample_index)
137
+
138
+ return {virtual_instruments_state: virtual_instruments_dict,
139
+ text2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
140
+ placeholder=f"Saved as {virtual_instrument_name}!")}
141
+
142
+ with gr.Tab("Text2sound"):
143
+ gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
144
+ with gr.Row(variant="panel"):
145
+ with gr.Column(scale=3):
146
+ text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
147
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
148
+
149
+ with gr.Column(scale=1):
150
+ text2sound_sampling_button = gr.Button(variant="primary",
151
+ value="Generate a batch of samples and show "
152
+ "the first one",
153
+ scale=1)
154
+ text2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
155
+ label="Sample index",
156
+ info="Swipe to view other samples")
157
+ with gr.Row(variant="panel"):
158
+ with gr.Column(variant="panel", scale=1):
159
+ text2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
160
+ text2sound_sampler_radio = gradioWebUI.get_sampler_radio()
161
+ text2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
162
+ text2sound_duration_slider = gradioWebUI.get_duration_slider()
163
+ text2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
164
+ text2sound_seed_textbox = gradioWebUI.get_seed_textbox()
165
+
166
+ with gr.Column(variant="panel", scale=1):
167
+ with gr.Row(variant="panel", ):
168
+ text2sound_sampled_spectrogram_image = gr.Image(label="Sampled spectrogram", type="numpy", )
169
+ text2sound_sampled_phase_image = gr.Image(label="Sampled phase", type="numpy")
170
+ text2sound_sampled_audio = gr.Audio(type="numpy", label="Play",
171
+ scale=1)
172
+
173
+ with gr.Row(variant="panel", ):
174
+ text2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=2,
175
+ placeholder="Name of your instrument",
176
+ scale=1)
177
+ text2sound_save_instrument_button = gr.Button(variant="primary",
178
+ value="Save instrument",
179
+ scale=1)
180
+
181
+ with gr.Row(variant="panel"):
182
+ text2sound_latent_representation_image = gr.Image(label="Sampled latent representation", type="numpy",
183
+ height=200, width=100, visible=False)
184
+ text2sound_quantized_latent_representation_image = gr.Image(label="Quantized latent representation",
185
+ type="numpy", height=200, width=100,
186
+ visible=False)
187
+
188
+ text2sound_sampling_button.click(diffusion_random_sample,
189
+ inputs=[text2sound_prompts_textbox,
190
+ text2sound_negative_prompts_textbox,
191
+ text2sound_batchsize_slider,
192
+ text2sound_duration_slider,
193
+ text2sound_guidance_scale_slider, text2sound_sampler_radio,
194
+ text2sound_sample_steps_slider,
195
+ text2sound_seed_textbox,
196
+ text2sound_state],
197
+ outputs=[text2sound_latent_representation_image,
198
+ text2sound_quantized_latent_representation_image,
199
+ text2sound_sampled_spectrogram_image,
200
+ text2sound_sampled_phase_image,
201
+ text2sound_sampled_audio,
202
+ text2sound_seed_textbox,
203
+ text2sound_state,
204
+ text2sound_sample_index_slider])
205
+
206
+ text2sound_save_instrument_button.click(save_virtual_instrument,
207
+ inputs=[text2sound_sample_index_slider,
208
+ text2sound_instrument_name_textbox,
209
+ text2sound_state,
210
+ virtual_instruments_state],
211
+ outputs=[virtual_instruments_state,
212
+ text2sound_instrument_name_textbox])
213
+
214
+ text2sound_sample_index_slider.change(show_random_sample,
215
+ inputs=[text2sound_sample_index_slider, text2sound_state],
216
+ outputs=[text2sound_latent_representation_image,
217
+ text2sound_quantized_latent_representation_image,
218
+ text2sound_sampled_spectrogram_image,
219
+ text2sound_sampled_phase_image,
220
+ text2sound_sampled_audio])
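The evenly spaced denoising schedule handed to mySampler.respace is easier to picture with concrete numbers; the values below assume timesteps = 1000 and 10 sampling steps purely for illustration:

import numpy as np

timesteps, sample_steps = 1000, 10   # illustrative values only
schedule = list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32))
print(schedule)   # [0, 111, 222, 333, 444, 555, 666, 777, 888, 999]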
webUI/natural_language_guided_4/track_maker.py ADDED
@@ -0,0 +1,248 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+
5
+ from model.DiffSynthSampler import DiffSynthSampler
6
+ from webUI.natural_language_guided_4.utils import encodeBatch2GradioOutput_STFT
7
+ import mido
8
+ import torchaudio.transforms as transforms
9
+ from tqdm import tqdm
10
+
11
+
12
+ # def pitch_shift_audio(waveform, sample_rate, n_steps, device='cpu', n_fft=1024, hop_length=None):
13
+ # # If the input is a numpy array, convert it to a torch.Tensor
14
+ # if isinstance(waveform, np.ndarray):
15
+ # waveform = torch.from_numpy(waveform)
16
+ #
17
+ # # Default hop_length to a fraction of n_fft (n_fft // 4 below) to reduce the memory cost of the STFT
18
+ # if hop_length is None:
19
+ # hop_length = n_fft // 4
20
+ #
21
+ # # Move the waveform to the target device
22
+ # waveform = waveform.to(device, dtype=torch.float32)
23
+ #
24
+ # # Build the PitchShift transform and move it to the same device
25
+ # pitch_shift = transforms.PitchShift(
26
+ # sample_rate=sample_rate,
27
+ # n_steps=n_steps,
28
+ # n_fft=n_fft,
29
+ # hop_length=hop_length
30
+ # ).to(device)
31
+ #
32
+ # # Apply the transform, move the result back to the CPU, and convert it to a numpy array
33
+ # shifted_waveform = pitch_shift(waveform).detach().cpu().numpy()
34
+ #
35
+ # return shifted_waveform
36
+
37
+
38
+ def pitch_shift_librosa(waveform, sample_rate, total_steps, step_size=4, n_fft=4096, hop_length=None):
+     # librosa expects a numpy array as input
+     if isinstance(waveform, torch.Tensor):
+         waveform = waveform.numpy()
+
+     # If hop_length is not provided, default to a quarter of n_fft
+     if hop_length is None:
+         hop_length = n_fft // 4
+
+     # Shift incrementally, at most step_size semitones per pass, walking toward total_steps
+     # with signed steps so that downward shifts (negative total_steps) are handled as well.
+     current_waveform = waveform
+     remaining = float(total_steps)
+
+     while abs(remaining) > 1e-6:
+         step = float(np.sign(remaining)) * min(step_size, abs(remaining))  # never overshoot total_steps
+         current_waveform = librosa.effects.pitch_shift(
+             current_waveform, sr=sample_rate, n_steps=step,
+             n_fft=n_fft, hop_length=hop_length
+         )
+         remaining -= step
+
+     return current_waveform
59
+
60
+
61
+
62
+
63
+ class NoteEvent:
64
+ def __init__(self, note, velocity, start_time, duration):
65
+ self.note = note
66
+ self.velocity = velocity
67
+ self.start_time = start_time # In ticks
68
+ self.duration = duration # In ticks
69
+
70
+ def __str__(self):
71
+ return f"Note {self.note}, velocity {self.velocity}, start_time {self.start_time}, duration {self.duration}"
72
+
73
+
74
+ class Track:
75
+ def __init__(self, track, ticks_per_beat, max_notes=100):
76
+ self.tempo_events = self._parse_tempo_events(track)
77
+ self.events = self._parse_note_events(track)
78
+ self.ticks_per_beat = ticks_per_beat
79
+ self.max_notes = int(max_notes)
80
+
81
+ def _parse_tempo_events(self, track):
82
+ tempo_events = []
83
+ current_tempo = 500000 # Default MIDI tempo is 120 BPM which is 500000 microseconds per beat
84
+ for msg in track:
85
+ if msg.type == 'set_tempo':
86
+ tempo_events.append((msg.time, msg.tempo))
87
+ elif not msg.is_meta:
88
+ tempo_events.append((msg.time, current_tempo))
89
+ return tempo_events
90
+
91
+ def _parse_note_events(self, track):
92
+ events = []
93
+ start_time = 0
94
+ for msg in track:
95
+ if not msg.is_meta:
96
+ start_time += msg.time
97
+ if msg.type == 'note_on' and msg.velocity > 0:
98
+ note_on_time = start_time
99
+ elif msg.type == 'note_on' and msg.velocity == 0:
100
+ duration = start_time - note_on_time
101
+ events.append(NoteEvent(msg.note, msg.velocity, note_on_time, duration))
102
+ return events
103
+
104
+ def synthesize_track(self, diffSynthSampler, sample_rate=16000):
105
+ track_audio = np.zeros(int(self._get_total_time() * sample_rate), dtype=np.float32)
106
+ current_tempo = 500000 # Start with default MIDI tempo 120 BPM
107
+ duration_note_mapping = {}
108
+
109
+ for event in tqdm(self.events[:self.max_notes]):
110
+ current_tempo = self._get_tempo_at(event.start_time)
111
+ seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
112
+ start_time_sec = event.start_time * seconds_per_tick
113
+ # Todo: set a minimum duration
114
+ duration_sec = event.duration * seconds_per_tick
115
+ duration_sec = max(duration_sec, 0.75)
116
+ start_sample = int(start_time_sec * sample_rate)
117
+ if not (str(duration_sec) in duration_note_mapping):
118
+ note_sample = diffSynthSampler(event.velocity, duration_sec)
119
+ duration_note_mapping[str(duration_sec)] = note_sample / np.max(np.abs(note_sample))
120
+
121
+ # note_audio = pyrb.pitch_shift(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
122
+ # note_audio = pitch_shift_audio(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
123
+ note_audio = pitch_shift_librosa(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
124
+ end_sample = start_sample + len(note_audio)
125
+ track_audio[start_sample:end_sample] += note_audio
126
+
127
+ return track_audio
128
+
129
+ def _get_tempo_at(self, time_tick):
130
+ current_tempo = 500000 # Start with default MIDI tempo 120 BPM
131
+ elapsed_ticks = 0
132
+
133
+ for tempo_change in self.tempo_events:
134
+ if elapsed_ticks + tempo_change[0] > time_tick:
135
+ return current_tempo
136
+ elapsed_ticks += tempo_change[0]
137
+ current_tempo = tempo_change[1]
138
+
139
+ return current_tempo
140
+
141
+ def _get_total_time(self):
142
+ total_time = 0
143
+ current_tempo = 500000 # Start with default MIDI tempo 120 BPM
144
+
145
+ for event in self.events:
146
+ current_tempo = self._get_tempo_at(event.start_time)
147
+ seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
148
+ total_time += event.duration * seconds_per_tick
149
+
150
+ return total_time + 10
151
+
152
+
153
+ class DiffSynth:
154
+ def __init__(self, instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder, text_encoder, CLAP_tokenizer, device,
155
+ model_sample_rate=16000, timesteps=1000, channels=4, freq_resolution=512, time_resolution=256, VAE_scale=4, squared=False):
156
+
157
+ self.noise_prediction_model = noise_prediction_model
158
+ self.VAE_quantizer = VAE_quantizer
159
+ self.VAE_decoder = VAE_decoder
160
+ self.device = device
161
+ self.model_sample_rate = model_sample_rate
162
+ self.timesteps = timesteps
163
+ self.channels = channels
164
+ self.freq_resolution = freq_resolution
165
+ self.time_resolution = time_resolution
166
+ self.height = int(freq_resolution/VAE_scale)
167
+ self.VAE_scale = VAE_scale
168
+ self.squared = squared
169
+ self.text_encoder = text_encoder
170
+ self.CLAP_tokenizer = CLAP_tokenizer
171
+
172
+ # instruments_configs 是字典 string -> (condition, negative_condition, guidance_scale, sample_steps, seed, initial_noise, sampler)
173
+ self.instruments_configs = instruments_configs
174
+ self.diffSynthSamplers = {}
175
+ self._update_instruments()
176
+
177
+
178
+ def _update_instruments(self):
179
+
180
+ def diffSynthSamplerWrapper(instruments_config):
181
+
182
+ def diffSynthSampler(velocity, duration_sec, sample_rate=16000):
183
+
184
+ condition = self.text_encoder.get_text_features(**self.CLAP_tokenizer([""], padding=True, return_tensors="pt")).to(self.device)
185
+ sample_steps = instruments_config['sample_steps']
186
+ sampler = instruments_config['sampler']
187
+ noising_strength = instruments_config['noising_strength']
188
+ latent_representation = instruments_config['latent_representation']
189
+ attack = instruments_config['attack']
190
+ before_release = instruments_config['before_release']
191
+
192
+ assert sample_rate == self.model_sample_rate, "sample_rate != model_sample_rate"
193
+
194
+ width = int(self.time_resolution * ((duration_sec + 1) / 4) / self.VAE_scale)
195
+
196
+ mySampler = DiffSynthSampler(self.timesteps, height=128, channels=4, noise_strategy="repeat", mute=True)
197
+ mySampler.respace(list(np.linspace(0, self.timesteps - 1, sample_steps, dtype=np.int32)))
198
+
199
+ # mask = 1, freeze
200
+ latent_mask = torch.zeros((1, 1, self.height, width), dtype=torch.float32).to(self.device)
201
+ latent_mask[:, :, :, :int(self.time_resolution * (attack / 4) / self.VAE_scale)] = 1.0
202
+ latent_mask[:, :, :, -int(self.time_resolution * ((before_release+1) / 4) / self.VAE_scale):] = 1.0
203
+
204
+ latent_representations, _ = \
205
+ mySampler.inpaint_sample(model=self.noise_prediction_model, shape=(1, self.channels, self.height, width),
206
+ noising_strength=noising_strength, condition=condition,
207
+ guide_img=latent_representation, mask=latent_mask, return_tensor=True,
208
+ sampler=sampler,
209
+ use_dynamic_mask=True, end_noise_level_ratio=0.0,
210
+ mask_flexivity=1.0)
211
+
212
+
213
+ latent_representations = latent_representations[-1]
214
+
215
+ quantized_latent_representations, _, (_, _, _) = self.VAE_quantizer(latent_representations)
216
+ # Todo: remove hard-coding
217
+
218
+ flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(self.VAE_decoder,
219
+ quantized_latent_representations,
220
+ resolution=(
221
+ 512,
222
+ width * self.VAE_scale),
223
+ original_STFT_batch=None,
224
+ )
225
+
226
+
227
+ return rec_signals[0]
228
+
229
+ return diffSynthSampler
230
+
231
+ for key in self.instruments_configs.keys():
232
+ self.diffSynthSamplers[key] = diffSynthSamplerWrapper(self.instruments_configs[key])
233
+
234
+ def get_music(self, mid, instrument_names, sample_rate=16000, max_notes=100):
235
+ tracks = [Track(t, mid.ticks_per_beat, max_notes) for t in mid.tracks]
236
+ assert len(tracks) <= len(instrument_names), f"len(tracks) = {len(tracks)} > {len(instrument_names)} = len(instrument_names)"
237
+
238
+ track_audios = [track.synthesize_track(self.diffSynthSamplers[instrument_names[i]], sample_rate=sample_rate) for i, track in enumerate(tracks)]
239
+
240
+ # 将所有音轨填充至最长音轨的长度,以便它们可以被叠加
241
+ max_length = max(len(audio) for audio in track_audios)
242
+ full_audio = np.zeros(max_length, dtype=np.float32) # 初始化全音频数组为零
243
+ for audio in track_audios:
244
+ # 音轨可能不够长,需要填充零
245
+ padded_audio = np.pad(audio, (0, max_length - len(audio)), 'constant')
246
+ full_audio += padded_audio # 叠加音轨
247
+
248
+ return full_audio
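A minimal usage sketch for DiffSynth follows. It is illustrative only: it assumes the noise-prediction model, VAE quantizer/decoder, CLAP text encoder/tokenizer and an instruments_configs dict have already been loaded by the web UI, that the chosen instrument names are keys of that dict, and that soundfile is available for writing the result.

import mido
import numpy as np
import soundfile as sf

synth = DiffSynth(instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder,
                  text_encoder, CLAP_tokenizer, device)

# One instrument name per MIDI track; len(instrument_names) must be >= the number of tracks.
mid = mido.MidiFile("webUI/presets/midis/Ode_to_Joy_Easy_variation.mid")
audio = synth.get_music(mid, instrument_names=["organ", "string"], max_notes=100)

# get_music returns a float32 mix at 16 kHz; normalize before writing to avoid clipping.
sf.write("rendered.wav", audio / np.max(np.abs(audio)), 16000)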
webUI/natural_language_guided_4/utils.py ADDED
@@ -0,0 +1,228 @@
+ import librosa
+ import numpy as np
+ import torch
+ from PIL import Image
+ from tools import np_power_to_db, decode_stft, depad_STFT
+
+
+ def spectrogram_to_Gradio_image(spc):
+     ### input: spc [np.ndarray]
+     frequency_resolution, time_resolution = spc.shape[-2], spc.shape[-1]
+     spc = np.reshape(spc, (frequency_resolution, time_resolution))
+
+     # Todo:
+     magnitude_spectrum = np.abs(spc)
+     log_spectrum = np_power_to_db(magnitude_spectrum)
+     flipped_log_spectrum = np.flipud(log_spectrum)
+
+     colorful_spc = np.ones((frequency_resolution, time_resolution, 3)) * -80.0
+     colorful_spc[:, :, 0] = flipped_log_spectrum
+     colorful_spc[:, :, 1] = flipped_log_spectrum
+     colorful_spc[:, :, 2] = np.ones((frequency_resolution, time_resolution)) * -60.0
+     # Rescale to 0-255 and convert to uint8
+     rescaled = (colorful_spc + 80.0) / 80.0
+     rescaled = (255.0 * rescaled).astype(np.uint8)
+     return rescaled
+
+
+ def phase_to_Gradio_image(phase):
+     ### input: phase [np.ndarray]
+     frequency_resolution, time_resolution = phase.shape[-2], phase.shape[-1]
+     phase = np.reshape(phase, (frequency_resolution, time_resolution))
+
+     # Todo:
+     flipped_phase = np.flipud(phase)
+     flipped_phase = (flipped_phase + 1.0) / 2.0
+
+     colorful_spc = np.zeros((frequency_resolution, time_resolution, 3))
+     colorful_spc[:, :, 0] = flipped_phase
+     colorful_spc[:, :, 1] = flipped_phase
+     colorful_spc[:, :, 2] = 0.2
+     # Rescale to 0-255 and convert to uint8
+     rescaled = (255.0 * colorful_spc).astype(np.uint8)
+     return rescaled
+
+
+ def latent_representation_to_Gradio_image(latent_representation):
+     # input: latent_representation [torch.Tensor]
+     if not isinstance(latent_representation, np.ndarray):
+         latent_representation = latent_representation.to("cpu").detach().numpy()
+     image = latent_representation
+
+     def normalize_image(img):
+         min_val = img.min()
+         max_val = img.max()
+         normalized_img = ((img - min_val) / (max_val - min_val) * 255)
+         return normalized_img
+
+     image[0, :, :] = normalize_image(image[0, :, :])
+     image[1, :, :] = normalize_image(image[1, :, :])
+     image[2, :, :] = normalize_image(image[2, :, :])
+     image[3, :, :] = normalize_image(image[3, :, :])
+     image_transposed = np.transpose(image, (1, 2, 0))
+     enlarged_image = np.repeat(image_transposed, 8, axis=0)
+     enlarged_image = np.repeat(enlarged_image, 8, axis=1)
+     return np.flipud(enlarged_image).astype(np.uint8)
+
+
+ def InputBatch2Encode_STFT(encoder, STFT_batch, resolution=(512, 256), quantizer=None, squared=True):
+     """Transform a batch of STFT representations into signals, Gradio images and latent encodings."""
+     # Todo: remove resolution hard-coding
+     frequency_resolution, time_resolution = resolution
+
+     device = next(encoder.parameters()).device
+     if quantizer is not None:
+         latent_representation_batch = encoder(STFT_batch.to(device))
+         quantized_latent_representation_batch, loss, (_, _, _) = quantizer(latent_representation_batch)
+     else:
+         mu, logvar, latent_representation_batch = encoder(STFT_batch.to(device))
+         quantized_latent_representation_batch = None
+
+     STFT_batch = STFT_batch.to("cpu").detach().numpy()
+
+     origin_flipped_log_spectrums, origin_flipped_phases, origin_signals = [], [], []
+     for STFT in STFT_batch:
+
+         padded_D_rec = decode_stft(STFT)
+         D_rec = depad_STFT(padded_D_rec)
+         spc = np.abs(D_rec)
+         phase = np.angle(D_rec)
+
+         flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
+         flipped_phase = phase_to_Gradio_image(phase)
+
+         # get_audio
+         rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
+
+         origin_flipped_log_spectrums.append(flipped_log_spectrum)
+         origin_flipped_phases.append(flipped_phase)
+         origin_signals.append(rec_signal)
+
+     return origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, \
+         latent_representation_batch, quantized_latent_representation_batch
+
+
+ def encodeBatch2GradioOutput_STFT(decoder, latent_vector_batch, resolution=(512, 256), original_STFT_batch=None):
+     """Decode a batch of latent vectors into spectrogram images, phase images and audio signals."""
+     # Todo: remove resolution hard-coding
+     frequency_resolution, time_resolution = resolution
+
+     if isinstance(latent_vector_batch, np.ndarray):
+         latent_vector_batch = torch.from_numpy(latent_vector_batch).to(next(decoder.parameters()).device)
+
+     reconstruction_batch = decoder(latent_vector_batch).to("cpu").detach().numpy()
+
+     flipped_log_spectrums, flipped_phases, rec_signals = [], [], []
+     flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp = [], [], []
+
+     for index, STFT in enumerate(reconstruction_batch):
+         padded_D_rec = decode_stft(STFT)
+         D_rec = depad_STFT(padded_D_rec)
+         spc = np.abs(D_rec)
+         phase = np.angle(D_rec)
+
+         flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
+         flipped_phase = phase_to_Gradio_image(phase)
+
+         # get_audio
+         rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
+
+         flipped_log_spectrums.append(flipped_log_spectrum)
+         flipped_phases.append(flipped_phase)
+         rec_signals.append(rec_signal)
+
+         ##########################################
+
+         if original_STFT_batch is not None:
+             STFT[0, :, :] = original_STFT_batch[index, 0, :, :]
+
+             padded_D_rec = decode_stft(STFT)
+             D_rec = depad_STFT(padded_D_rec)
+             spc = np.abs(D_rec)
+             phase = np.angle(D_rec)
+
+             flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
+             flipped_phase = phase_to_Gradio_image(phase)
+
+             # get_audio
+             rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
+
+             flipped_log_spectrums_with_original_amp.append(flipped_log_spectrum)
+             flipped_phases_with_original_amp.append(flipped_phase)
+             rec_signals_with_original_amp.append(rec_signal)
+
+     return flipped_log_spectrums, flipped_phases, rec_signals, \
+         flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp
+
+
+ def add_instrument(source_dict, virtual_instruments_dict, virtual_instrument_name, sample_index):
+
+     virtual_instruments = virtual_instruments_dict["virtual_instruments"]
+     virtual_instrument = {
+         "latent_representation": source_dict["latent_representations"][sample_index],
+         "quantized_latent_representation": source_dict["quantized_latent_representations"][sample_index],
+         "sampler": source_dict["sampler"],
+         "signal": source_dict["new_sound_rec_signals_gradio"][sample_index],
+         "spectrogram_gradio_image": source_dict["new_sound_spectrogram_gradio_images"][sample_index],
+         "phase_gradio_image": source_dict["new_sound_phase_gradio_images"][sample_index]}
+     virtual_instruments[virtual_instrument_name] = virtual_instrument
+     virtual_instruments_dict["virtual_instruments"] = virtual_instruments
+     return virtual_instruments_dict
+
+
+ def resize_image_to_aspect_ratio(image_data, aspect_ratio_width, aspect_ratio_height):
+     """
+     Stretch an image to a given aspect ratio, keeping both input and output as NumPy arrays.
+
+     Args:
+         image_data (numpy array): input image data (height, width, 3)
+         aspect_ratio_width (int): target width ratio
+         aspect_ratio_height (int): target height ratio
+
+     Returns:
+         numpy array: the resized image data
+     """
+     # Current width and height of the image
+     original_height, original_width, channels = image_data.shape
+
+     # Current aspect ratio
+     current_aspect_ratio = original_width / original_height
+
+     # Target aspect ratio
+     target_aspect_ratio = aspect_ratio_width / aspect_ratio_height
+
+     # Decide whether to stretch the width or the height
+     if current_aspect_ratio > target_aspect_ratio:
+         # The image is relatively too wide, so stretch the height
+         new_width = original_width
+         new_height = int(new_width / target_aspect_ratio)
+     else:
+         # The image is relatively too tall (or already matches), so stretch the width
+         new_height = original_height
+         new_width = int(new_height * target_aspect_ratio)
+
+     # Convert the numpy array to a PIL image
+     image = Image.fromarray(image_data.astype('uint8'))
+
+     # Resize with PIL, using LANCZOS in place of the removed ANTIALIAS
+     resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+     # Convert the PIL image back to a numpy array
+     resized_image_data = np.array(resized_image)
+
+     return resized_image_data
+
+
+ def average_np_arrays(arr_list):
+     if not arr_list:
+         raise ValueError("Input list cannot be empty")
+
+     stacked_arrays = np.stack(arr_list, axis=0)
+
+     avg_array = np.mean(stacked_arrays, axis=0)
+
+     return avg_array
+ return avg_array
webUI/presets/instruments/ax.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/electronic_sound.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/keyboard.wav ADDED
Binary file (128 kB).

webUI/presets/instruments/organ.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/string.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/synth_lead.wav ADDED
Binary file (131 kB).

webUI/presets/midis/Air_on_the_G_String.mid ADDED
Binary file (6 kB).

webUI/presets/midis/Arhbo.mid ADDED
Binary file (14.7 kB).

webUI/presets/midis/Canon_in_D.mid ADDED
Binary file (10.9 kB).

webUI/presets/midis/Ode_to_Joy_Easy_variation.mid ADDED
Binary file (920 Bytes).

webUI/presets/midis/Rrharil.mid ADDED
Binary file (16.2 kB).