WeixuanYuan committed on
Commit
bd6e54b
1 Parent(s): bdd2a77

Upload 70 files

Files changed (48)
  1. app.py +22 -11
  2. webUI/natural_language_guided/__pycache__/README.cpython-310.pyc +0 -0
  3. webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc +0 -0
  4. webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc +0 -0
  5. webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc +0 -0
  6. webUI/natural_language_guided/__pycache__/note2music.cpython-310.pyc +0 -0
  7. webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc +0 -0
  8. webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc +0 -0
  9. webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc +0 -0
  10. webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc +0 -0
  11. webUI/natural_language_guided/build_instrument.py +2 -1
  12. webUI/natural_language_guided/note2music.py +174 -0
  13. webUI/natural_language_guided/text2sound.py +2 -2
  14. webUI/natural_language_guided/track_maker.py +246 -246
  15. webUI/natural_language_guided_4/GAN.py +164 -0
  16. webUI/natural_language_guided_4/README.py +53 -0
  17. webUI/natural_language_guided_4/__pycache__/gradio_webUI.cpython-310.pyc +0 -0
  18. webUI/natural_language_guided_4/__pycache__/inpaint_with_text.cpython-310.pyc +0 -0
  19. webUI/natural_language_guided_4/__pycache__/instruments.cpython-310.pyc +0 -0
  20. webUI/natural_language_guided_4/__pycache__/load_presets.cpython-310.pyc +0 -0
  21. webUI/natural_language_guided_4/__pycache__/note2music.cpython-310.pyc +0 -0
  22. webUI/natural_language_guided_4/__pycache__/sound2sound_with_text.cpython-310.pyc +0 -0
  23. webUI/natural_language_guided_4/__pycache__/text2sound.cpython-310.pyc +0 -0
  24. webUI/natural_language_guided_4/__pycache__/track_maker.cpython-310.pyc +0 -0
  25. webUI/natural_language_guided_4/__pycache__/utils.cpython-310.pyc +0 -0
  26. webUI/natural_language_guided_4/build_instrument.py +305 -0
  27. webUI/natural_language_guided_4/gradio_webUI.py +68 -0
  28. webUI/natural_language_guided_4/inpaint_with_text.py +371 -0
  29. webUI/natural_language_guided_4/instruments.py +60 -0
  30. webUI/natural_language_guided_4/load_presets.py +81 -0
  31. webUI/natural_language_guided_4/note2music.py +200 -0
  32. webUI/natural_language_guided_4/rec.py +190 -0
  33. webUI/natural_language_guided_4/sound2sound_with_text.py +325 -0
  34. webUI/natural_language_guided_4/super_resolution_with_text.py +387 -0
  35. webUI/natural_language_guided_4/text2sound.py +220 -0
  36. webUI/natural_language_guided_4/track_maker.py +248 -0
  37. webUI/natural_language_guided_4/utils.py +228 -0
  38. webUI/presets/instruments/ax.wav +0 -0
  39. webUI/presets/instruments/electronic_sound.wav +0 -0
  40. webUI/presets/instruments/keyboard.wav +0 -0
  41. webUI/presets/instruments/organ.wav +0 -0
  42. webUI/presets/instruments/string.wav +0 -0
  43. webUI/presets/instruments/synth_lead.wav +0 -0
  44. webUI/presets/midis/Air_on_the_G_String.mid +0 -0
  45. webUI/presets/midis/Arhbo.mid +0 -0
  46. webUI/presets/midis/Canon_in_D.mid +0 -0
  47. webUI/presets/midis/Ode_to_Joy_Easy_variation.mid +0 -0
  48. webUI/presets/midis/Rrharil.mid +0 -0
app.py CHANGED
@@ -15,12 +15,17 @@ from model.multimodal_model import get_multi_modal_model
 
 
 import gradio as gr
-from webUI.natural_language_guided.gradio_webUI import GradioWebUI
-from webUI.natural_language_guided.text2sound import get_text2sound_module
-from webUI.natural_language_guided.sound2sound_with_text import get_sound2sound_with_text_module
-from webUI.natural_language_guided.inpaint_with_text import get_inpaint_with_text_module
-from webUI.natural_language_guided.build_instrument import get_build_instrument_module
-from webUI.natural_language_guided.README import get_readme_module
+
+from tools import read_wav_to_numpy
+from webUI.natural_language_guided_4.gradio_webUI import GradioWebUI
+from webUI.natural_language_guided_4.instruments import get_instruments_module
+from webUI.natural_language_guided_4.load_presets import load_presets
+from webUI.natural_language_guided_4.text2sound import get_text2sound_module
+from webUI.natural_language_guided_4.sound2sound_with_text import get_sound2sound_with_text_module
+from webUI.natural_language_guided_4.inpaint_with_text import get_inpaint_with_text_module
+# from webUI.natural_language_guided_4.build_instrument import get_build_instrument_module
+from webUI.natural_language_guided_4.note2music import get_arrangement_module
+# from webUI.natural_language_guided_4.README import get_readme_module
 
 
 
@@ -62,28 +67,34 @@ else:
 
 
 
-
 gradioWebUI = GradioWebUI(device, VAE, uNet, text_encoder, CLAP_tokenizer, freq_resolution=512, time_resolution=256, channels=4, timesteps=1000, squared=False,
                           VAE_scale=4, flexible_duration=True, noise_strategy="repeat", GAN_generator=None)
 
+virtual_instruments, midis = load_presets(gradioWebUI)
+
+
+
 with gr.Blocks(theme=gr.themes.Soft(), mode="dark") as demo:
-    # with gr.Blocks(theme='WeixuanYuan/Soft_dark', mode="dark") as demo:
-    gr.Markdown('Thank you for using DiffuSynth v0.2! \n <span style="color:red">The [Arrangement] feature is still being improved!</span>', unsafe_allow_html=True)
+    gr.Markdown("Thank you for using DiffuSynth v0.2!")
 
     reconstruction_state = gr.State(value={})
     text2sound_state = gr.State(value={})
     sound2sound_state = gr.State(value={})
     inpaint_state = gr.State(value={})
     super_resolution_state = gr.State(value={})
-    virtual_instruments_state = gr.State(value={"virtual_instruments": {}})
+    virtual_instruments_state = gr.State(value={"virtual_instruments": virtual_instruments})
+    midi_files_state = gr.State(value={"midis": midis})
 
     get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state)
    get_sound2sound_with_text_module(gradioWebUI, sound2sound_state, virtual_instruments_state)
    get_inpaint_with_text_module(gradioWebUI, inpaint_state, virtual_instruments_state)
-    get_build_instrument_module(gradioWebUI, virtual_instruments_state)
+    # get_build_instrument_module(gradioWebUI, virtual_instruments_state)
+    get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state)
     # get_readme_module()
+    # get_instruments_module(gradioWebUI, virtual_instruments_state)
 
 demo.launch(debug=True, share=True)
+# demo.launch(debug=True, share=False)
 
 
 
webUI/natural_language_guided/__pycache__/README.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/README.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/README.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/note2music.cpython-310.pyc ADDED
Binary file (6.59 kB).
 
webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc differ
 
webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc differ
 
webUI/natural_language_guided/build_instrument.py CHANGED
@@ -4,13 +4,14 @@ import torch
 import gradio as gr
 import mido
 from io import BytesIO
+# import pyrubberband as pyrb
+import torchaudio.transforms as transforms
 
 from model.DiffSynthSampler import DiffSynthSampler
 from tools import adsr_envelope, adjust_audio_length
 from webUI.natural_language_guided.track_maker import DiffSynth
 from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
     spectrogram_to_Gradio_image
-import torchaudio.transforms as transforms
 
 
 def time_stretch_audio(waveform, sample_rate, stretch_factor):
webUI/natural_language_guided/note2music.py ADDED
@@ -0,0 +1,174 @@
+import librosa
+import numpy as np
+import torch
+import gradio as gr
+import mido
+from io import BytesIO
+# import pyrubberband as pyrb
+import torchaudio.transforms as transforms
+
+from model.DiffSynthSampler import DiffSynthSampler
+from tools import adsr_envelope, adjust_audio_length
+from webUI.natural_language_guided.track_maker import DiffSynth, Track
+from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
+    spectrogram_to_Gradio_image
+
+
+def get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state):
+    # Load configurations
+    uNet = gradioWebUI.uNet
+    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
+    VAE_scale = gradioWebUI.VAE_scale
+    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
+
+    timesteps = gradioWebUI.timesteps
+    VAE_quantizer = gradioWebUI.VAE_quantizer
+    VAE_decoder = gradioWebUI.VAE_decoder
+    CLAP = gradioWebUI.CLAP
+    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
+    device = gradioWebUI.device
+    squared = gradioWebUI.squared
+    sample_rate = gradioWebUI.sample_rate
+    noise_strategy = gradioWebUI.noise_strategy
+
+    def read_midi(midi, midi_files_dict):
+        print(midi)
+        midi_name = midi_file.name
+        mid = mido.MidiFile(file=BytesIO(midi))
+        tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
+
+        midi_info_text = f"Name: {midi_name}"
+        for track in tracks:
+            midi_info_text += f"\n {len(track.events)}"
+
+
+        return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
+                                              placeholder=midi_info_text),
+                midi_files_state: midi_files_dict}
+
+    def refresh_instruments(virtual_instruments_dict):
+        virtual_instruments_names = list(virtual_instruments_dict["virtual_instruments"].keys())
+        print(f"virtual_instruments_names: {virtual_instruments_names}")
+
+        return {select_instrument_dropdown: gr.Dropdown.update(choices=["New Option 1", "New Option 2", "New Option 3"])}
+
+    def select_sound(virtual_instrument_name, virtual_instruments_dict):
+        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
+        virtual_instrument = virtual_instruments[virtual_instrument_name]
+
+        return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
+                source_sound_phase_image: virtual_instrument["phase_gradio_image"],
+                source_sound_audio: virtual_instrument["signal"]}
+
+    def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names,
+                   virtual_instruments_dict):
+
+        if noising_strength < 1:
+            print(f"Warning: making track with noising_strength = {noising_strength} < 1")
+        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
+        sample_steps = int(inpaint_steps)
+
+        instrument_names = instrument_names.split("@")
+        instruments_configs = {}
+        for virtual_instrument_name in instrument_names:
+            virtual_instrument = virtual_instruments[virtual_instrument_name]
+
+            latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(
+                device)
+            sampler = virtual_instrument["sampler"]
+
+            batchsize = 1
+
+            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
+
+            mid = mido.MidiFile(file=BytesIO(midi))
+            instruments_configs[virtual_instrument_name] = {
+                'sample_steps': sample_steps,
+                'sampler': sampler,
+                'noising_strength': noising_strength,
+                'latent_representation': latent_representation,
+                'attack': attack,
+                'before_release': before_release}
+
+        diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
+
+        full_audio = diffSynth.get_music(mid, instrument_names)
+
+        return {track_audio: (sample_rate, full_audio)}
+
+    with gr.Tab("Arrangement"):
+        gr.Markdown("Make music with generated sounds!")
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=3):
+                preset_button_1 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
+                preset_button_2 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
+                preset_button_3 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
+                midi_file = gr.File(label="Upload midi file", type="binary", scale=2)
+            with gr.Column(scale=3):
+                midi_info_textbox = gr.Textbox(label="Midi info", lines=10, placeholder="Please select/upload a midi on the left.")
+                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
+                                                      placeholder="Names of your instrument used to play the midi", scale=1)
+            with gr.Column(scale=3):
+                refresh_instrument_button = gr.Button(variant="primary", value="Refresh instruments", scale=1)
+                # instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
+                #                                      placeholder="Name of your instrument", scale=1)
+                select_instrument_dropdown = gr.Dropdown(choices=["Option 1", "Option 2", "Option 3"], label="Choose an option")
+                source_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
+            with gr.Column(scale=3):
+                make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
+                track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
+        with gr.Row(variant="panel"):
+            with gr.Tab("Origin sound"):
+                inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
+                                                 label="inpaint_steps")
+                noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
+                end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
+                                                         label="end_noise_level_ratio")
+                attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
+                before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
+                                                  label="before_release in sec")
+                release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
+                mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
+                                                  label="mask_flexivity")
+            with gr.Tab("Length adjustment config"):
+                use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
+                test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
+                test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
+                test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
+                duration_slider = gradioWebUI.get_duration_slider()
+            with gr.Tab("Pitch shift config"):
+                pitch_shift_radio = gr.Radio(choices=["librosa", "torchaudio", "rubberband"],
+                                             value="librosa")
+
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=2):
+                with gr.Row(variant="panel"):
+                    source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
+                                                              height=600, scale=1)
+                    source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
+                                                        height=600, scale=1)
+
+
+
+    # instrument_name_textbox.change(select_sound,
+    #                                inputs=[instrument_name_textbox, virtual_instruments_state],
+    #                                outputs=[source_sound_audio])
+
+    refresh_instrument_button.click(refresh_instruments,
+                                    inputs=[virtual_instruments_state],
+                                    outputs=[select_instrument_dropdown])
+
+    make_track_button.click(make_track,
+                            inputs=[inpaint_steps_slider, midi_file,
+                                    noising_strength_slider,
+                                    attack_slider,
+                                    before_release_slider,
+                                    instrument_names_textbox,
+                                    virtual_instruments_state],
+                            outputs=[track_audio])
+
+    midi_file.change(read_midi,
+                     inputs=[midi_file,
+                             midi_files_state],
+                     outputs=[midi_info_textbox,
+                              midi_files_state])
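A minimal usage sketch for the Arrangement callback added above (not part of the commit): make_track receives the MIDI as raw bytes from gr.File(type="binary") and one instrument name per MIDI track, joined with "@". The preset names "organ" and "string" are assumptions based on the files under webUI/presets/instruments/; the real keys come from load_presets(gradioWebUI).

# Hypothetical illustration only; preset names are assumed to exist in virtual_instruments.
with open("webUI/presets/midis/Canon_in_D.mid", "rb") as f:
    midi_bytes = f.read()

result = make_track(inpaint_steps=20, midi=midi_bytes, noising_strength=1.0,
                    attack=0.5, before_release=0.5,
                    instrument_names="organ@string",  # one name per MIDI track, "@"-separated
                    virtual_instruments_dict={"virtual_instruments": virtual_instruments})
# result[track_audio] is a (sample_rate, waveform) tuple suitable for gr.Audio.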
webUI/natural_language_guided/text2sound.py CHANGED
@@ -46,8 +46,8 @@ def get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_sta
 
         mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
         negative_condition = \
-            CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
-                0]
+            CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
+
         mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))
 
         mySampler.respace(list(np.linspace(0, timesteps - 1, text2sound_sample_steps, dtype=np.int32)))
webUI/natural_language_guided/track_maker.py CHANGED
@@ -1,247 +1,247 @@
 import librosa
 import numpy as np
 import torch
 
 from model.DiffSynthSampler import DiffSynthSampler
 from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT
 import mido
 import torchaudio.transforms as transforms
 from tqdm import tqdm
 
 
 # def pitch_shift_audio(waveform, sample_rate, n_steps, device='cpu', n_fft=1024, hop_length=None):
 #     # If the input is a numpy array, convert it to a torch.Tensor
 #     if isinstance(waveform, np.ndarray):
 #         waveform = torch.from_numpy(waveform)
 #
 #     # Set hop_length to a quarter of n_fft (a reasonable default) to reduce the memory cost of the STFT
 #     if hop_length is None:
 #         hop_length = n_fft // 4
 #
 #     # Move the waveform to the target device
 #     waveform = waveform.to(device, dtype=torch.float32)
 #
 #     # Create the pitch_shift transform and move it to the target device
 #     pitch_shift = transforms.PitchShift(
 #         sample_rate=sample_rate,
 #         n_steps=n_steps,
 #         n_fft=n_fft,
 #         hop_length=hop_length
 #     ).to(device)
 #
 #     # Apply the transform, move the result back to the CPU, and convert it to a numpy array
 #     shifted_waveform = pitch_shift(waveform).detach().cpu().numpy()
 #
 #     return shifted_waveform
 
 
 def pitch_shift_librosa(waveform, sample_rate, total_steps, step_size=4, n_fft=4096, hop_length=None):
     # librosa expects a numpy array as input
     if isinstance(waveform, torch.Tensor):
         waveform = waveform.numpy()
 
     # If hop_length is not given, default to a quarter of n_fft
     if hop_length is None:
         hop_length = n_fft // 4
 
     # Pitch-shift incrementally, at most step_size semitones at a time
     current_waveform = waveform
     num_steps = int(np.ceil(total_steps / step_size))
 
     for i in range(num_steps):
         step = min(step_size, total_steps - i * step_size)  # make sure the last step does not overshoot total_steps
         current_waveform = librosa.effects.pitch_shift(
             current_waveform, sr=sample_rate, n_steps=step,
             n_fft=n_fft, hop_length=hop_length
         )
 
     return current_waveform
 
 
 
 
 class NoteEvent:
     def __init__(self, note, velocity, start_time, duration):
         self.note = note
         self.velocity = velocity
         self.start_time = start_time  # In ticks
         self.duration = duration  # In ticks
 
     def __str__(self):
         return f"Note {self.note}, velocity {self.velocity}, start_time {self.start_time}, duration {self.duration}"
 
 
 class Track:
     def __init__(self, track, ticks_per_beat):
         self.tempo_events = self._parse_tempo_events(track)
         self.events = self._parse_note_events(track)
         self.ticks_per_beat = ticks_per_beat
 
     def _parse_tempo_events(self, track):
         tempo_events = []
         current_tempo = 500000  # Default MIDI tempo is 120 BPM which is 500000 microseconds per beat
         for msg in track:
             if msg.type == 'set_tempo':
                 tempo_events.append((msg.time, msg.tempo))
             elif not msg.is_meta:
                 tempo_events.append((msg.time, current_tempo))
         return tempo_events
 
     def _parse_note_events(self, track):
         events = []
         start_time = 0
         for msg in track:
             if not msg.is_meta:
                 start_time += msg.time
                 if msg.type == 'note_on' and msg.velocity > 0:
                     note_on_time = start_time
                 elif msg.type == 'note_on' and msg.velocity == 0:
                     duration = start_time - note_on_time
                     events.append(NoteEvent(msg.note, msg.velocity, note_on_time, duration))
         return events
 
     def synthesize_track(self, diffSynthSampler, sample_rate=16000):
         track_audio = np.zeros(int(self._get_total_time() * sample_rate), dtype=np.float32)
         current_tempo = 500000  # Start with default MIDI tempo 120 BPM
         duration_note_mapping = {}
 
         for event in tqdm(self.events[:25]):
             current_tempo = self._get_tempo_at(event.start_time)
             seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
             start_time_sec = event.start_time * seconds_per_tick
             # Todo: set a minimum duration
             duration_sec = event.duration * seconds_per_tick
             duration_sec = max(duration_sec, 0.75)
             start_sample = int(start_time_sec * sample_rate)
             if not (str(duration_sec) in duration_note_mapping):
                 note_sample = diffSynthSampler(event.velocity, duration_sec)
                 duration_note_mapping[str(duration_sec)] = note_sample / np.max(np.abs(note_sample))
 
             # note_audio = pyrb.pitch_shift(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
             # note_audio = pitch_shift_audio(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
             note_audio = pitch_shift_librosa(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
             end_sample = start_sample + len(note_audio)
             track_audio[start_sample:end_sample] += note_audio
 
         return track_audio
 
     def _get_tempo_at(self, time_tick):
         current_tempo = 500000  # Start with default MIDI tempo 120 BPM
         elapsed_ticks = 0
 
         for tempo_change in self.tempo_events:
             if elapsed_ticks + tempo_change[0] > time_tick:
                 return current_tempo
             elapsed_ticks += tempo_change[0]
             current_tempo = tempo_change[1]
 
         return current_tempo
 
     def _get_total_time(self):
         total_time = 0
         current_tempo = 500000  # Start with default MIDI tempo 120 BPM
 
         for event in self.events:
             current_tempo = self._get_tempo_at(event.start_time)
             seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
             total_time += event.duration * seconds_per_tick
 
         return total_time
 
 
 class DiffSynth:
     def __init__(self, instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder, text_encoder, CLAP_tokenizer, device,
                  model_sample_rate=16000, timesteps=1000, channels=4, freq_resolution=512, time_resolution=256, VAE_scale=4, squared=False):
 
         self.noise_prediction_model = noise_prediction_model
         self.VAE_quantizer = VAE_quantizer
         self.VAE_decoder = VAE_decoder
         self.device = device
         self.model_sample_rate = model_sample_rate
         self.timesteps = timesteps
         self.channels = channels
         self.freq_resolution = freq_resolution
         self.time_resolution = time_resolution
         self.height = int(freq_resolution/VAE_scale)
         self.VAE_scale = VAE_scale
         self.squared = squared
         self.text_encoder = text_encoder
         self.CLAP_tokenizer = CLAP_tokenizer
 
         # instruments_configs is a dict: string -> (condition, negative_condition, guidance_scale, sample_steps, seed, initial_noise, sampler)
         self.instruments_configs = instruments_configs
         self.diffSynthSamplers = {}
         self._update_instruments()
 
 
     def _update_instruments(self):
 
         def diffSynthSamplerWrapper(instruments_config):
 
             def diffSynthSampler(velocity, duration_sec, sample_rate=16000):
 
                 condition = self.text_encoder.get_text_features(**self.CLAP_tokenizer([""], padding=True, return_tensors="pt")).to(self.device)
                 sample_steps = instruments_config['sample_steps']
                 sampler = instruments_config['sampler']
                 noising_strength = instruments_config['noising_strength']
                 latent_representation = instruments_config['latent_representation']
                 attack = instruments_config['attack']
                 before_release = instruments_config['before_release']
 
                 assert sample_rate == self.model_sample_rate, "sample_rate != model_sample_rate"
 
                 width = int(self.time_resolution * ((duration_sec + 1) / 4) / self.VAE_scale)
 
                 mySampler = DiffSynthSampler(self.timesteps, height=128, channels=4, noise_strategy="repeat", mute=True)
                 mySampler.respace(list(np.linspace(0, self.timesteps - 1, sample_steps, dtype=np.int32)))
 
                 # mask = 1, freeze
                 latent_mask = torch.zeros((1, 1, self.height, width), dtype=torch.float32).to(self.device)
                 latent_mask[:, :, :, :int(self.time_resolution * (attack / 4) / self.VAE_scale)] = 1.0
                 latent_mask[:, :, :, -int(self.time_resolution * ((before_release+1) / 4) / self.VAE_scale):] = 1.0
 
                 latent_representations, _ = \
                     mySampler.inpaint_sample(model=self.noise_prediction_model, shape=(1, self.channels, self.height, width),
                                              noising_strength=noising_strength, condition=condition,
                                              guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                              sampler=sampler,
                                              use_dynamic_mask=True, end_noise_level_ratio=0.0,
                                              mask_flexivity=1.0)
 
 
                 latent_representations = latent_representations[-1]
 
                 quantized_latent_representations, _, (_, _, _) = self.VAE_quantizer(latent_representations)
                 # Todo: remove hard-coding
 
                 flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(self.VAE_decoder,
                                                                                                             quantized_latent_representations,
                                                                                                             resolution=(
                                                                                                                 512,
                                                                                                                 width * self.VAE_scale),
                                                                                                             original_STFT_batch=None,
                                                                                                             )
 
 
                 return rec_signals[0]
 
             return diffSynthSampler
 
         for key in self.instruments_configs.keys():
             self.diffSynthSamplers[key] = diffSynthSamplerWrapper(self.instruments_configs[key])
 
     def get_music(self, mid, instrument_names, sample_rate=16000):
         tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
-        assert len(tracks) == len(instrument_names), f"len(tracks) = {len(tracks)} != {len(instrument_names)} = len(instrument_names)"
+        assert len(tracks) <= len(instrument_names), f"len(tracks) = {len(tracks)} > {len(instrument_names)} = len(instrument_names)"
 
         track_audios = [track.synthesize_track(self.diffSynthSamplers[instrument_names[i]], sample_rate=sample_rate) for i, track in enumerate(tracks)]
 
         # Pad every track to the length of the longest one so they can be mixed together
         max_length = max(len(audio) for audio in track_audios)
         full_audio = np.zeros(max_length, dtype=np.float32)  # initialise the mixed audio buffer with zeros
         for audio in track_audios:
             # a track may be shorter, so pad it with zeros
             padded_audio = np.pad(audio, (0, max_length - len(audio)), 'constant')
             full_audio += padded_audio  # mix this track into the full audio
 
     return full_audio
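The only functional change in this file is the relaxed assertion in DiffSynth.get_music (len(tracks) <= len(instrument_names) instead of strict equality), so extra instrument names beyond the number of MIDI tracks are now tolerated. For reference, a small usage sketch of pitch_shift_librosa defined above (illustrative only, not part of the commit): it shifts in chunks of at most step_size semitones, so a 9-semitone shift is applied as 4 + 4 + 1.

import numpy as np
from webUI.natural_language_guided.track_maker import pitch_shift_librosa

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone_e3 = np.sin(2 * np.pi * 164.81 * t).astype(np.float32)  # E3, the pitch the model is trained on
shifted = pitch_shift_librosa(tone_e3, sr, total_steps=9)    # E3 shifted up 9 semitones, roughly C#4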
webUI/natural_language_guided_4/GAN.py ADDED
@@ -0,0 +1,164 @@
+import gradio as gr
+import numpy as np
+import torch
+
+from tools import safe_int
+from webUI.natural_language_guided_STFT.utils import encodeBatch2GradioOutput, latent_representation_to_Gradio_image, \
+    add_instrument
+
+
+def get_testGAN(gradioWebUI, text2sound_state, virtual_instruments_state):
+    # Load configurations
+    gan_generator = gradioWebUI.GAN_generator
+    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
+    VAE_scale = gradioWebUI.VAE_scale
+    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
+
+    timesteps = gradioWebUI.timesteps
+    VAE_quantizer = gradioWebUI.VAE_quantizer
+    VAE_decoder = gradioWebUI.VAE_decoder
+    CLAP = gradioWebUI.CLAP
+    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
+    device = gradioWebUI.device
+    squared = gradioWebUI.squared
+    sample_rate = gradioWebUI.sample_rate
+    noise_strategy = gradioWebUI.noise_strategy
+
+    def gan_random_sample(text2sound_prompts, text2sound_negative_prompts, text2sound_batchsize,
+                          text2sound_duration,
+                          text2sound_guidance_scale, text2sound_sampler,
+                          text2sound_sample_steps, text2sound_seed,
+                          text2sound_dict):
+        text2sound_seed = safe_int(text2sound_seed, 12345678)
+
+        width = int(time_resolution * ((text2sound_duration + 1) / 4) / VAE_scale)
+
+        text2sound_batchsize = int(text2sound_batchsize)
+
+        text2sound_embedding = \
+            CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
+                device)
+
+        CFG = int(text2sound_guidance_scale)
+
+        condition = text2sound_embedding.repeat(text2sound_batchsize, 1)
+
+        noise = torch.randn(text2sound_batchsize, channels, height, width).to(device)
+        latent_representations = gan_generator(noise, condition)
+
+        print(latent_representations[0, 0, :3, :3])
+
+        latent_representation_gradio_images = []
+        quantized_latent_representation_gradio_images = []
+        new_sound_spectrogram_gradio_images = []
+        new_sound_rec_signals_gradio = []
+
+        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
+        # Todo: remove hard-coding
+        flipped_log_spectrums, rec_signals = encodeBatch2GradioOutput(VAE_decoder, quantized_latent_representations,
+                                                                      resolution=(512, width * VAE_scale),
+                                                                      centralized=False,
+                                                                      squared=squared)
+
+        for i in range(text2sound_batchsize):
+            latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
+            quantized_latent_representation_gradio_images.append(
+                latent_representation_to_Gradio_image(quantized_latent_representations[i]))
+            new_sound_spectrogram_gradio_images.append(flipped_log_spectrums[i])
+            new_sound_rec_signals_gradio.append((sample_rate, rec_signals[i]))
+
+        text2sound_dict["latent_representations"] = latent_representations.to("cpu").detach().numpy()
+        text2sound_dict["quantized_latent_representations"] = quantized_latent_representations.to("cpu").detach().numpy()
+        text2sound_dict["latent_representation_gradio_images"] = latent_representation_gradio_images
+        text2sound_dict["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
+        text2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
+        text2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
+
+        text2sound_dict["condition"] = condition.to("cpu").detach().numpy()
+        # text2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
+        text2sound_dict["guidance_scale"] = CFG
+        text2sound_dict["sampler"] = text2sound_sampler
+
+        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][0],
+                text2sound_quantized_latent_representation_image:
+                    text2sound_dict["quantized_latent_representation_gradio_images"][0],
+                text2sound_sampled_spectrogram_image: text2sound_dict["new_sound_spectrogram_gradio_images"][0],
+                text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][0],
+                text2sound_seed_textbox: text2sound_seed,
+                text2sound_state: text2sound_dict,
+                text2sound_sample_index_slider: gr.update(minimum=0, maximum=text2sound_batchsize - 1, value=0, step=1,
+                                                          visible=True,
+                                                          label="Sample index.",
+                                                          info="Swipe to view other samples")}
+
+    def show_random_sample(sample_index, text2sound_dict):
+        sample_index = int(sample_index)
+        text2sound_dict["sample_index"] = sample_index
+        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][
+            sample_index],
+            text2sound_quantized_latent_representation_image:
+                text2sound_dict["quantized_latent_representation_gradio_images"][sample_index],
+            text2sound_sampled_spectrogram_image: text2sound_dict["new_sound_spectrogram_gradio_images"][
+                sample_index],
+            text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][sample_index]}
+
+
+    with gr.Tab("Text2sound_GAN"):
+        gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=3):
+                text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
+                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
+
+            with gr.Column(scale=1):
+                text2sound_sampling_button = gr.Button(variant="primary",
+                                                       value="Generate a batch of samples and show "
+                                                             "the first one",
+                                                       scale=1)
+                text2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
+                                                           label="Sample index",
+                                                           info="Swipe to view other samples")
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=1, variant="panel"):
+                text2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
+                text2sound_sampler_radio = gradioWebUI.get_sampler_radio()
+                text2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
+                text2sound_duration_slider = gradioWebUI.get_duration_slider()
+                text2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
+                text2sound_seed_textbox = gradioWebUI.get_seed_textbox()
+
+            with gr.Column(scale=1):
+                text2sound_sampled_spectrogram_image = gr.Image(label="Sampled spectrogram", type="numpy", height=420)
+                text2sound_sampled_audio = gr.Audio(type="numpy", label="Play")
+
+
+        with gr.Row(variant="panel"):
+            text2sound_latent_representation_image = gr.Image(label="Sampled latent representation", type="numpy",
+                                                              height=200, width=100)
+            text2sound_quantized_latent_representation_image = gr.Image(label="Quantized latent representation",
+                                                                        type="numpy", height=200, width=100)
+
+        text2sound_sampling_button.click(gan_random_sample,
+                                         inputs=[text2sound_prompts_textbox,
+                                                 text2sound_negative_prompts_textbox,
+                                                 text2sound_batchsize_slider,
+                                                 text2sound_duration_slider,
+                                                 text2sound_guidance_scale_slider, text2sound_sampler_radio,
+                                                 text2sound_sample_steps_slider,
+                                                 text2sound_seed_textbox,
+                                                 text2sound_state],
+                                         outputs=[text2sound_latent_representation_image,
+                                                  text2sound_quantized_latent_representation_image,
+                                                  text2sound_sampled_spectrogram_image,
+                                                  text2sound_sampled_audio,
+                                                  text2sound_seed_textbox,
+                                                  text2sound_state,
+                                                  text2sound_sample_index_slider])
+
+
+        text2sound_sample_index_slider.change(show_random_sample,
+                                              inputs=[text2sound_sample_index_slider, text2sound_state],
+                                              outputs=[text2sound_latent_representation_image,
+                                                       text2sound_quantized_latent_representation_image,
+                                                       text2sound_sampled_spectrogram_image,
+                                                       text2sound_sampled_audio])
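A hedged sketch of the tensor shapes that gan_random_sample above relies on (illustrative, not part of the commit): the generator receives Gaussian noise shaped like the VAE latents plus a repeated CLAP text embedding and is expected to return latents of the same shape, which then pass through VAE_quantizer and VAE_decoder just like the diffusion latents. The 512-dimensional embedding size is an assumption, and app.py currently passes GAN_generator=None.

import torch

batchsize, channels, height = 4, 4, 128                 # height = freq_resolution / VAE_scale = 512 / 4
duration = 3.0
width = int(256 * ((duration + 1) / 4) / 4)              # time_resolution=256, VAE_scale=4 -> 64
condition = torch.randn(1, 512).repeat(batchsize, 1)     # stand-in for the CLAP text features
noise = torch.randn(batchsize, channels, height, width)
# latents = gan_generator(noise, condition)              # expected shape: (batchsize, channels, height, width)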
webUI/natural_language_guided_4/README.py ADDED
@@ -0,0 +1,53 @@
+import gradio as gr
+
+readme_content = """## Stable Diffusion for Sound Generation
+
+This project applies stable diffusion[1] to sound generation. Inspired by the work of AUTOMATIC1111, 2022[2], we have implemented a preliminary version of text2sound, sound2sound, inpaint, as well as an additional interpolation feature, all accessible through a web UI.
+
+### Neural Network Training Data:
+The neural network is trained using the filtered NSynth dataset[3], which is a large-scale and high-quality collection of annotated musical notes, comprising 305,979 musical notes. However, for this project, only samples with a pitch set to E3 were used, resulting in an actual training sample size of 4,096, making it a low-resource project.
+
+The training took place on an NVIDIA Tesla T4 GPU and spanned approximately 10 hours.
+
+### Natural Language Guidance:
+Natural language guidance is derived from the multi-label annotations of the NSynth dataset. The labels included in the training are:
+
+- **Instrument Families**: bass, brass, flute, guitar, keyboard, mallet, organ, reed, string, synth lead, vocal.
+
+- **Instrument Sources**: acoustic, electronic, synthetic.
+
+- **Note Qualities**: bright, dark, distortion, fast decay, long release, multiphonic, nonlinear env, percussive, reverb, tempo-synced.
+
+### Usage Hints:
+
+1. **Prompt Format**: It's recommended to use the format "label1, label2, label3", e.g., "organ, dark, long release".
+
+2. **Unique Sounds**: If you keep generating the same sound, try setting a different seed!
+
+3. **Sample Indexing**: Drag the "Sample index slider" to view other samples within the generated batch.
+
+4. **Running on CPU**: Be cautious with the settings for 'batchsize' and 'sample_steps' when running on CPU to avoid timeouts. Recommended settings are batchsize ≤ 4 and sample_steps = 15.
+
+5. **Editing Sounds**: Generated audio can be downloaded and then re-uploaded for further editing at the sound2sound/inpaint sections.
+
+6. **Guidance Scale**: A higher 'guidance_scale' intensifies the influence of natural language conditioning on the generation[4]. It's recommended to set it between 3 and 10.
+
+7. **Noising Strength**: A smaller 'noising_strength' value makes the generated sound closer to the input sound.
+
+References:
+
+[1] Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (pp. 10684-10695).
+
+[2] AUTOMATIC1111. (2022). Stable Diffusion Web UI [Computer software]. Retrieved from https://github.com/AUTOMATIC1111/stable-diffusion-webui
+
+[3] Engel, J., Resnick, C., Roberts, A., Dieleman, S., Eck, D., Simonyan, K., & Norouzi, M. (2017). Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders.
+
+[4] Ho, J., & Salimans, T. (2022). Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598.
+"""
+
+def get_readme_module():
+
+    with gr.Tab("README"):
+        # gr.Markdown("Use interpolation to generate a gradient sound sequence.")
+        with gr.Column(scale=3):
+            readme_textbox = gr.Textbox(label="readme", lines=40, value=readme_content, interactive=False)
webUI/natural_language_guided_4/__pycache__/gradio_webUI.cpython-310.pyc ADDED
Binary file (3.55 kB).

webUI/natural_language_guided_4/__pycache__/inpaint_with_text.cpython-310.pyc ADDED
Binary file (10.9 kB).

webUI/natural_language_guided_4/__pycache__/instruments.cpython-310.pyc ADDED
Binary file (2.62 kB).

webUI/natural_language_guided_4/__pycache__/load_presets.cpython-310.pyc ADDED
Binary file (2.74 kB).

webUI/natural_language_guided_4/__pycache__/note2music.cpython-310.pyc ADDED
Binary file (7.62 kB).

webUI/natural_language_guided_4/__pycache__/sound2sound_with_text.cpython-310.pyc ADDED
Binary file (9.32 kB).

webUI/natural_language_guided_4/__pycache__/text2sound.cpython-310.pyc ADDED
Binary file (6.5 kB).

webUI/natural_language_guided_4/__pycache__/track_maker.cpython-310.pyc ADDED
Binary file (7.56 kB).

webUI/natural_language_guided_4/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.89 kB).

webUI/natural_language_guided_4/build_instrument.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ import mido
6
+ from io import BytesIO
7
+ # import pyrubberband as pyrb
8
+ import torchaudio.transforms as transforms
9
+
10
+ from model.DiffSynthSampler import DiffSynthSampler
11
+ from tools import adsr_envelope, adjust_audio_length
12
+ from webUI.natural_language_guided.track_maker import DiffSynth
13
+ from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
14
+ spectrogram_to_Gradio_image
15
+
16
+
17
+ def time_stretch_audio(waveform, sample_rate, stretch_factor):
18
+ # 如果输入是 numpy 数组,则转换为 torch.Tensor
19
+ if isinstance(waveform, np.ndarray):
20
+ waveform = torch.from_numpy(waveform)
21
+
22
+ # 确保 waveform 的类型为 torch.float32
23
+ waveform = waveform.to(torch.float32)
24
+
25
+ # 设置 STFT 参数
26
+ n_fft = 2048 # STFT 窗口大小
27
+ hop_length = n_fft // 4 # STFT 的 hop length 设置为 n_fft 的四分之一
28
+
29
+ # 计算短时傅里叶变换 (STFT)
30
+ stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length, return_complex=True)
31
+
32
+ # 创建 TimeStretch 变换
33
+ time_stretch = transforms.TimeStretch(hop_length=hop_length, n_freq=1025, fixed_rate=False)
34
+
35
+ print(stft.shape)
36
+ # 应用时间伸缩
37
+ stretched_stft = time_stretch(stft, stretch_factor)
38
+
39
+ # 将 STFT 转换回时域波形
40
+ stretched_waveform = torch.istft(stretched_stft, n_fft=n_fft, hop_length=hop_length)
41
+
42
+ # 返回处理后的 waveform,转换为 numpy 数组
43
+ return stretched_waveform.detach().numpy()
44
+
45
+
46
+ def get_build_instrument_module(gradioWebUI, virtual_instruments_state):
47
+ # Load configurations
48
+ uNet = gradioWebUI.uNet
49
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
50
+ VAE_scale = gradioWebUI.VAE_scale
51
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
52
+
53
+ timesteps = gradioWebUI.timesteps
54
+ VAE_quantizer = gradioWebUI.VAE_quantizer
55
+ VAE_decoder = gradioWebUI.VAE_decoder
56
+ CLAP = gradioWebUI.CLAP
57
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
58
+ device = gradioWebUI.device
59
+ squared = gradioWebUI.squared
60
+ sample_rate = gradioWebUI.sample_rate
61
+ noise_strategy = gradioWebUI.noise_strategy
62
+
63
+ def select_sound(virtual_instrument_name, virtual_instruments_dict):
64
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
65
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
66
+
67
+ return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
68
+ source_sound_phase_image: virtual_instrument["phase_gradio_image"],
69
+ source_sound_audio: virtual_instrument["signal"]}
70
+
71
+ def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names, virtual_instruments_dict):
72
+
73
+ if noising_strength < 1:
74
+ print(f"Warning: making track with noising_strength = {noising_strength} < 1")
75
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
76
+ sample_steps = int(inpaint_steps)
77
+
78
+ instrument_names = instrument_names.split("@")
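+ # Several instrument names arrive as one '@'-separated string (presumably one name per MIDI track)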
79
+ instruments_configs = {}
80
+ for virtual_instrument_name in instrument_names:
81
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
82
+
83
+ latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
84
+ sampler = virtual_instrument["sampler"]
85
+
86
+ batchsize = 1
87
+
88
+ latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
89
+
90
+ mid = mido.MidiFile(file=BytesIO(midi))
91
+ instruments_configs[virtual_instrument_name] = {
92
+ 'sample_steps': sample_steps,
93
+ 'sampler': sampler,
94
+ 'noising_strength': noising_strength,
95
+ 'latent_representation': latent_representation,
96
+ 'attack': attack,
97
+ 'before_release': before_release}
98
+
99
+ diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
100
+
101
+ full_audio = diffSynth.get_music(mid, instrument_names)
102
+
103
+ return {track_audio: (sample_rate, full_audio)}
104
+
105
+ def test_duration_inpaint(virtual_instrument_name, inpaint_steps, duration, noising_strength, end_noise_level_ratio, attack, before_release, mask_flexivity, virtual_instruments_dict, use_dynamic_mask):
106
+ width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)
107
+
108
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
109
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
110
+
111
+ latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
112
+ sample_steps = int(inpaint_steps)
113
+ sampler = virtual_instrument["sampler"]
114
+ batchsize = 1
115
+
116
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
117
+ mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))
118
+
119
+ latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
120
+
121
+ # mask == 1 marks latent regions that stay frozen during inpainting
122
+ latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)
123
+
124
+ latent_mask[:, :, :, :int(time_resolution * (attack / 4) / VAE_scale)] = 1.0
125
+ latent_mask[:, :, :, -int(time_resolution * ((before_release+1) / 4) / VAE_scale):] = 1.0
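+ # Freeze the attack region and the tail before release; only the middle (sustain) section is re-generated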
126
+
127
+
128
+ text2sound_embedding = \
129
+ CLAP.get_text_features(**CLAP_tokenizer([""], padding=True, return_tensors="pt"))[0].to(
130
+ device)
131
+ condition = text2sound_embedding.repeat(1, 1)
132
+
133
+
134
+ latent_representations, initial_noise = \
135
+ mySampler.inpaint_sample(model=uNet, shape=(batchsize, channels, height, width),
136
+ noising_strength=noising_strength,
137
+ guide_img=latent_representation, mask=latent_mask, return_tensor=True,
138
+ condition=condition, sampler=sampler,
139
+ use_dynamic_mask=use_dynamic_mask,
140
+ end_noise_level_ratio=end_noise_level_ratio,
141
+ mask_flexivity=mask_flexivity)
142
+
143
+ latent_representations = latent_representations[-1]
144
+
145
+ quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
146
+ # Todo: remove hard-coding
147
+ flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
148
+ quantized_latent_representations,
149
+ resolution=(
150
+ 512,
151
+ width * VAE_scale),
152
+ original_STFT_batch=None
153
+ )
154
+
155
+
156
+ return {test_duration_spectrogram_image: flipped_log_spectrums[0],
157
+ test_duration_phase_image: flipped_phases[0],
158
+ test_duration_audio: (sample_rate, rec_signals[0])}
159
+
160
+ def test_duration_envelope(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):
161
+
162
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
163
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
164
+ sample_rate, signal = virtual_instrument["signal"]
165
+
166
+ applied_signal = adsr_envelope(signal=signal, sample_rate=sample_rate, duration=duration,
167
+ attack_time=0.0, decay_time=0.0, sustain_level=1.0, release_time=release)
168
+
169
+ D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
170
+ spc = np.abs(D)
171
+ phase = np.angle(D)
172
+
173
+ flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
174
+ flipped_phase = phase_to_Gradio_image(phase)
175
+
176
+ return {test_duration_spectrogram_image: flipped_log_spectrum,
177
+ test_duration_phase_image: flipped_phase,
178
+ test_duration_audio: (sample_rate, applied_signal)}
179
+
180
+ def test_duration_stretch(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):
181
+
182
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
183
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
184
+ sample_rate, signal = virtual_instrument["signal"]
185
+
186
+ s = 3 / duration
187
+ # applied_signal = pyrb.time_stretch(signal, sample_rate, s)
188
+ applied_signal = time_stretch_audio(signal, sample_rate, s)
189
+ applied_signal = adjust_audio_length(applied_signal, int((duration+1) * sample_rate), sample_rate, sample_rate)
190
+
191
+ D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
192
+ spc = np.abs(D)
193
+ phase = np.angle(D)
194
+
195
+ flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
196
+ flipped_phase = phase_to_Gradio_image(phase)
197
+
198
+ return {test_duration_spectrogram_image: flipped_log_spectrum,
199
+ test_duration_phase_image: flipped_phase,
200
+ test_duration_audio: (sample_rate, applied_signal)}
201
+
202
+
203
+ with gr.Tab("TestInTrack"):
204
+ gr.Markdown("Make music with generated sounds!")
205
+ with gr.Row(variant="panel"):
206
+ with gr.Column(scale=3):
207
+ instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
208
+ placeholder="Name of your instrument", scale=1)
209
+ select_instrument_button = gr.Button(variant="primary", value="Select", scale=1)
210
+ with gr.Column(scale=3):
211
+ inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0, label="inpaint_steps")
212
+ noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
213
+ end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01, label="end_noise_level_ratio")
214
+ attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
215
+ before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="before_release in sec")
216
+ release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
217
+ mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01, label="mask_flexivity")
218
+ with gr.Column(scale=3):
219
+ use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
220
+ test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
221
+ test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
222
+ test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
223
+ duration_slider = gradioWebUI.get_duration_slider()
224
+
225
+ with gr.Row(variant="panel"):
226
+ with gr.Column(scale=2):
227
+ with gr.Row(variant="panel"):
228
+ source_sound_spectrogram_image = gr.Image(label="Source sound spectrogram", type="numpy",
229
+ height=600, scale=1)
230
+ source_sound_phase_image = gr.Image(label="Source sound phase", type="numpy",
231
+ height=600, scale=1)
232
+ source_sound_audio = gr.Audio(type="numpy", label="Play source sound", interactive=False)
233
+
234
+ with gr.Column(scale=3):
235
+ with gr.Row(variant="panel"):
236
+ test_duration_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
237
+ height=600, scale=1)
238
+ test_duration_phase_image = gr.Image(label="New sound phase", type="numpy",
239
+ height=600, scale=1)
240
+ test_duration_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
241
+
242
+ with gr.Row(variant="panel"):
243
+ with gr.Column(scale=1):
244
+ # track_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
245
+ # height=420, scale=1)
246
+ midi_file = gr.File(label="Upload midi file", type="binary")
247
+ instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
248
+ placeholder="Names of your instrument used to play the midi", scale=1)
249
+ track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
250
+ make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
251
+
252
+ select_instrument_button.click(select_sound,
253
+ inputs=[instrument_name_textbox, virtual_instruments_state],
254
+ outputs=[source_sound_spectrogram_image,
255
+ source_sound_phase_image,
256
+ source_sound_audio])
257
+
258
+ test_duration_envelope_button.click(test_duration_envelope,
259
+ inputs=[instrument_name_textbox, duration_slider,
260
+ noising_strength_slider,
261
+ attack_slider,
262
+ before_release_slider,
263
+ release_slider,
264
+ virtual_instruments_state,
265
+ ],
266
+ outputs=[test_duration_spectrogram_image,
267
+ test_duration_phase_image,
268
+ test_duration_audio])
269
+
270
+ test_duration_stretch_button.click(test_duration_stretch,
271
+ inputs=[instrument_name_textbox, duration_slider,
272
+ noising_strength_slider,
273
+ attack_slider,
274
+ before_release_slider,
275
+ release_slider,
276
+ virtual_instruments_state,
277
+ ],
278
+ outputs=[test_duration_spectrogram_image,
279
+ test_duration_phase_image,
280
+ test_duration_audio])
281
+
282
+ test_duration_inpaint_button.click(test_duration_inpaint,
283
+ inputs=[instrument_name_textbox,
284
+ inpaint_steps_slider,
285
+ duration_slider,
286
+ noising_strength_slider,
287
+ end_noise_level_ratio_slider,
288
+ attack_slider,
289
+ before_release_slider,
290
+ mask_flexivity_slider,
291
+ virtual_instruments_state,
292
+ use_dynamic_mask_checkbox],
293
+ outputs=[test_duration_spectrogram_image,
294
+ test_duration_phase_image,
295
+ test_duration_audio])
296
+
297
+ make_track_button.click(make_track,
298
+ inputs=[inpaint_steps_slider, midi_file,
299
+ noising_strength_slider,
300
+ attack_slider,
301
+ before_release_slider,
302
+ instrument_names_textbox,
303
+ virtual_instruments_state],
304
+ outputs=[track_audio])
305
+
webUI/natural_language_guided_4/gradio_webUI.py ADDED
@@ -0,0 +1,68 @@
1
+ import gradio as gr
2
+
3
+
4
+ class GradioWebUI():
5
+
6
+ def __init__(self, device, VAE, uNet, CLAP, CLAP_tokenizer,
7
+ freq_resolution=512, time_resolution=256, channels=4, timesteps=1000,
8
+ sample_rate=16000, squared=False, VAE_scale=4,
9
+ flexible_duration=False, noise_strategy="repeat",
10
+ GAN_generator = None):
11
+ self.device = device
12
+ self.VAE_encoder, self.VAE_quantizer, self.VAE_decoder = VAE._encoder, VAE._vq_vae, VAE._decoder
13
+ self.uNet = uNet
14
+ self.CLAP, self.CLAP_tokenizer = CLAP, CLAP_tokenizer
15
+ self.freq_resolution, self.time_resolution = freq_resolution, time_resolution
16
+ self.channels = channels
17
+ self.GAN_generator = GAN_generator
18
+
19
+ self.timesteps = timesteps
20
+ self.sample_rate = sample_rate
21
+ self.squared = squared
22
+ self.VAE_scale = VAE_scale
23
+ self.flexible_duration = flexible_duration
24
+ self.noise_strategy = noise_strategy
25
+
26
+ self.text2sound_state = gr.State(value={})
27
+ self.interpolation_state = gr.State(value={})
28
+ self.sound2sound_state = gr.State(value={})
29
+ self.inpaint_state = gr.State(value={})
30
+
31
+ def get_sample_steps_slider(self):
32
+ default_steps = 10 if (self.device == "cpu") else 20
33
+ return gr.Slider(minimum=10, maximum=100, value=default_steps, step=1,
34
+ label="Sample steps",
35
+ info="Sampling steps. The more sampling steps, the better the "
36
+ "theoretical result, but the time it consumes.")
37
+
38
+ def get_sampler_radio(self):
39
+ # return gr.Radio(choices=["ddpm", "ddim", "dpmsolver++", "dpmsolver"], value="ddim", label="Sampler")
40
+ return gr.Radio(choices=["ddpm", "ddim"], value="ddim", label="Sampler")
41
+
42
+ def get_batchsize_slider(self, cpu_batchsize=1):
43
+ return gr.Slider(minimum=1., maximum=16, value=cpu_batchsize if (self.device == "cpu") else 8, step=1, label="Batchsize")
44
+
45
+ def get_time_resolution_slider(self):
46
+ return gr.Slider(minimum=16., maximum=int(1024/self.VAE_scale), value=int(256/self.VAE_scale), step=1, label="Time resolution", interactive=True)
47
+
48
+ def get_duration_slider(self):
49
+ if self.flexible_duration:
50
+ return gr.Slider(minimum=0.25, maximum=8., value=3., step=0.01, label="duration in sec")
51
+ else:
52
+ return gr.Slider(minimum=1., maximum=8., value=3., step=1., label="duration in sec")
53
+
54
+ def get_guidance_scale_slider(self):
55
+ return gr.Slider(minimum=0., maximum=20., value=6., step=1.,
56
+ label="Guidance scale",
57
+ info="The larger this value, the more the generated sound is "
58
+ "influenced by the condition. Setting it to 0 is equivalent to "
59
+ "the negative case.")
60
+
61
+ def get_noising_strength_slider(self, default_noising_strength=0.7):
62
+ return gr.Slider(minimum=0.0, maximum=1.00, value=default_noising_strength, step=0.01,
63
+ label="noising strength",
64
+ info="The smaller this value, the more the generated sound is "
65
+ "closed to the origin.")
66
+
67
+ def get_seed_textbox(self):
68
+ return gr.Textbox(label="Seed", lines=1, placeholder="seed", value=0)
webUI/natural_language_guided_4/inpaint_with_text.py ADDED
@@ -0,0 +1,371 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ from scipy.ndimage import zoom
6
+
7
+ from model.DiffSynthSampler import DiffSynthSampler
8
+ from tools import adjust_audio_length, safe_int, pad_STFT, encode_stft
9
+ from webUI.natural_language_guided_4.utils import latent_representation_to_Gradio_image, InputBatch2Encode_STFT, \
10
+ encodeBatch2GradioOutput_STFT, add_instrument, average_np_arrays
11
+
12
+
13
+ def get_triangle_mask(height, width):
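+ # Returns a (height, width) binary mask that is 1 in the triangular region where i < slope * j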
14
+ mask = np.zeros((height, width))
15
+ slope = 8 / 3
16
+ for i in range(height):
17
+ for j in range(width):
18
+ if i < slope * j:
19
+ mask[i, j] = 1
20
+ return mask
21
+
22
+
23
+ def get_inpaint_with_text_module(gradioWebUI, inpaintWithText_state, virtual_instruments_state):
24
+ # Load configurations
25
+ uNet = gradioWebUI.uNet
26
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
27
+ VAE_scale = gradioWebUI.VAE_scale
28
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
29
+ timesteps = gradioWebUI.timesteps
30
+ VAE_encoder = gradioWebUI.VAE_encoder
31
+ VAE_quantizer = gradioWebUI.VAE_quantizer
32
+ VAE_decoder = gradioWebUI.VAE_decoder
33
+ CLAP = gradioWebUI.CLAP
34
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
35
+ device = gradioWebUI.device
36
+ squared = gradioWebUI.squared
37
+ sample_rate = gradioWebUI.sample_rate
38
+ noise_strategy = gradioWebUI.noise_strategy
39
+
40
+ def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin, inpaintWithText_dict):
41
+
42
+ origin_sr, origin_audio = sound2sound_origin
43
+
44
+ origin_audio = origin_audio / np.max(np.abs(origin_audio))
45
+
46
+ width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
47
+ audio_length = 256 * (VAE_scale * width - 1)
48
+ origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
49
+
50
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
51
+ padded_D = pad_STFT(D)
52
+ encoded_D = encode_stft(padded_D)
53
+
54
+ # Todo: justify batchsize to 1
55
+ origin_spectrogram_batch_tensor = torch.from_numpy(
56
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
57
+
58
+ # Todo: remove hard-coding
59
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
60
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
61
+ squared=squared)
62
+
63
+ inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
64
+ inpaintWithText_dict[
65
+ "sound2sound_origin_upload_latent_representation_image"] = latent_representation_to_Gradio_image(
66
+ origin_latent_representations[0]).tolist()
67
+ inpaintWithText_dict[
68
+ "sound2sound_origin_upload_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
69
+ quantized_origin_latent_representations[0]).tolist()
70
+ return {sound2sound_origin_spectrogram_image: origin_flipped_log_spectrums[0],
71
+ sound2sound_origin_phase_image: origin_flipped_phases[0],
72
+ sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
73
+ origin_latent_representations[0]),
74
+ sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
75
+ quantized_origin_latent_representations[0]),
76
+ sound2sound_origin_microphone_latent_representation_image: gr.update(),
77
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
78
+ inpaintWithText_state: inpaintWithText_dict}
79
+
80
+ def sound2sound_sample(sound2sound_origin_spectrogram,
81
+ text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
82
+ sound2sound_guidance_scale, sound2sound_sampler,
83
+ sound2sound_sample_steps,
84
+ sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area,
85
+ mask_time_begin, mask_time_end, mask_frequency_begin, mask_frequency_end,
86
+ inpaintWithText_dict
87
+ ):
88
+
89
+ # input preprocessing
90
+ sound2sound_seed = safe_int(sound2sound_seed, 12345678)
91
+ sound2sound_batchsize = int(sound2sound_batchsize)
92
+ noising_strength = sound2sound_noising_strength
93
+ sound2sound_sample_steps = int(sound2sound_sample_steps)
94
+ CFG = int(sound2sound_guidance_scale)
95
+
96
+ text2sound_embedding = \
97
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
98
+ device)
99
+
100
+ averaged_transparency = average_np_arrays(sound2sound_origin_spectrogram["layers"])
101
+ # print(f"averaged_transparency: {averaged_transparency}")
102
+ averaged_transparency = averaged_transparency[:, :, -1]
103
+ # print(f"averaged_transparency: {averaged_transparency}")
104
+ # print(f"np.shape(averaged_transparency): {np.shape(averaged_transparency)}")
105
+ # print(f"np.mean(averaged_transparency): {np.mean(averaged_transparency)}")
106
+ origin_latent_representations = torch.tensor(
107
+ inpaintWithText_dict["origin_upload_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
108
+ device)
109
+
110
+ merged_mask = np.where(averaged_transparency > 0, 1, 0)
111
+ latent_mask = zoom(merged_mask, (1 / VAE_scale, 1 / VAE_scale))
112
+ latent_mask = np.clip(latent_mask, 0, 1)
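+ # Any pixel painted on the spectrogram editor becomes 1 in the mask, which is then downscaled by VAE_scale to the latent resolution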
113
+ # print(f"latent_mask.avg = {np.mean(latent_mask)}")
114
+ latent_mask[int(mask_frequency_begin):int(mask_frequency_end),
115
+ int(mask_time_begin * time_resolution / (VAE_scale * 4)):int(
116
+ mask_time_end * time_resolution / (VAE_scale * 4))] = 1
117
+
118
+
119
+ if sound2sound_inpaint_area == "masked":
120
+ latent_mask = 1 - latent_mask
121
+ latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(sound2sound_batchsize, channels, 1,
122
+ 1).float().to(device)
123
+ latent_mask = torch.flip(latent_mask, [2])
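+ # The mask was drawn on the vertically flipped spectrogram display, so flip it along the frequency axis to match the latent orientation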
124
+
125
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
126
+ unconditional_condition = \
127
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
128
+ 0]
129
+ mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))
130
+
131
+ normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
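+ # Respace over the full schedule so that roughly `sample_steps` denoising steps fall inside the noised portion of the trajectory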
132
+
133
+ mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
134
+
135
+ # Todo: remove hard-coding
136
+ width = origin_latent_representations.shape[-1]
137
+ condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
138
+
139
+ new_sound_latent_representations, initial_noise = \
140
+ mySampler.inpaint_sample(model=uNet, shape=(sound2sound_batchsize, channels, height, width),
141
+ seed=sound2sound_seed,
142
+ noising_strength=noising_strength,
143
+ guide_img=origin_latent_representations, mask=latent_mask, return_tensor=True,
144
+ condition=condition, sampler=sound2sound_sampler)
145
+
146
+ new_sound_latent_representations = new_sound_latent_representations[-1]
147
+
148
+ # Quantize new sound latent representations
149
+ quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
150
+ new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
151
+ VAE_decoder,
152
+ quantized_new_sound_latent_representations,
153
+ resolution=(
154
+ 512,
155
+ width * VAE_scale),
156
+ original_STFT_batch=None
157
+ )
158
+
159
+ new_sound_latent_representation_gradio_images = []
160
+ new_sound_quantized_latent_representation_gradio_images = []
161
+ new_sound_spectrogram_gradio_images = []
162
+ new_sound_phase_gradio_images = []
163
+ new_sound_rec_signals_gradio = []
164
+ for i in range(sound2sound_batchsize):
165
+ new_sound_latent_representation_gradio_images.append(
166
+ latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
167
+ new_sound_quantized_latent_representation_gradio_images.append(
168
+ latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
169
+ new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
170
+ new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
171
+ new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
172
+
173
+ inpaintWithText_dict[
174
+ "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
175
+ inpaintWithText_dict[
176
+ "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
177
+ inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
178
+ inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
179
+ inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
180
+
181
+ inpaintWithText_dict["latent_representations"] = new_sound_latent_representations.to("cpu").detach().numpy()
182
+ inpaintWithText_dict["quantized_latent_representations"] = quantized_new_sound_latent_representations.to(
183
+ "cpu").detach().numpy()
184
+ inpaintWithText_dict["sampler"] = sound2sound_sampler
185
+
186
+ return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
187
+ new_sound_latent_representations[0]),
188
+ sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
189
+ quantized_new_sound_latent_representations[0]),
190
+ sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
191
+ sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
192
+ sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
193
+ sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
194
+ step=1.0,
195
+ visible=True,
196
+ label="Sample index",
197
+ info="Swipe to view other samples"),
198
+ sound2sound_seed_textbox: sound2sound_seed,
199
+ inpaintWithText_state: inpaintWithText_dict}
200
+
201
+ def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
202
+ sample_index = int(sound2sound_sample_index)
203
+ return {sound2sound_new_sound_latent_representation_image:
204
+ inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
205
+ sound2sound_new_sound_quantized_latent_representation_image:
206
+ inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
207
+ sound2sound_new_sound_spectrogram_image: inpaintWithText_dict["new_sound_spectrogram_gradio_images"][
208
+ sample_index],
209
+ sound2sound_new_sound_phase_image: inpaintWithText_dict["new_sound_phase_gradio_images"][
210
+ sample_index],
211
+ sound2sound_new_sound_audio: inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index]}
212
+
213
+ def save_virtual_instrument(sample_index, virtual_instrument_name, sound2sound_dict, virtual_instruments_dict):
214
+
215
+ virtual_instruments_dict = add_instrument(sound2sound_dict, virtual_instruments_dict, virtual_instrument_name,
216
+ sample_index)
217
+ return {virtual_instruments_state: virtual_instruments_dict,
218
+ sound2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
219
+ placeholder=f"Saved as {virtual_instrument_name}!")}
220
+
221
+ with gr.Tab("Inpaint"):
222
+ gr.Markdown("Upload a musical note and select the area by drawing on \"Input spectrogram\" for inpainting!")
223
+ with gr.Row(variant="panel"):
224
+ with gr.Column(scale=3):
225
+ text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
226
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
227
+
228
+ with gr.Column(scale=1):
229
+ sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
230
+
231
+ sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
232
+ label="Sample index",
233
+ info="Swipe to view other samples")
234
+
235
+ with gr.Row(variant="panel"):
236
+ with gr.Column(scale=1):
237
+ sound2sound_duration_slider = gradioWebUI.get_duration_slider()
238
+ sound2sound_origin_audio = gr.Audio(
239
+ sources=["microphone", "upload"], label="Upload/Record source sound",
240
+ waveform_options=gr.WaveformOptions(
241
+ waveform_color="#01C6FF",
242
+ waveform_progress_color="#0066B4",
243
+ skip_length=1,
244
+ show_controls=False,
245
+ ),
246
+ )
247
+
248
+ with gr.Row(variant="panel"):
249
+ with gr.Tab("Sound2sound settings"):
250
+ sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
251
+ sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
252
+ sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
253
+ sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider()
254
+ sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
255
+ sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()
256
+
257
+ with gr.Tab("Mask prototypes"):
258
+ with gr.Tab("Mask along time axis"):
259
+ mask_time_begin_slider = gr.Slider(minimum=0.0, maximum=4.00, value=0.0, step=0.01,
260
+ label="Begin time")
261
+ mask_time_end_slider = gr.Slider(minimum=0.0, maximum=4.00, value=0.0, step=0.01,
262
+ label="End time")
263
+ with gr.Tab("Mask along frequency axis"):
264
+ mask_frequency_begin_slider = gr.Slider(minimum=0, maximum=127, value=0, step=1,
265
+ label="Begin freq pixel")
266
+ mask_frequency_end_slider = gr.Slider(minimum=0, maximum=127, value=0, step=1,
267
+ label="End freq pixel")
268
+
269
+ with gr.Column(scale=1):
270
+ with gr.Row(variant="panel"):
271
+ sound2sound_origin_spectrogram_image = gr.ImageEditor(label="Input spectrogram (draw here!)",
272
+ type="numpy",
273
+ visible=True, height=600, scale=1)
274
+
275
+ sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
276
+ height=600, scale=1)
277
+
278
+ with gr.Row(variant="panel"):
279
+ sound2sound_inpaint_area_radio = gr.Radio(label="Inpainting area", choices=["masked", "unmasked"],
280
+ value="masked", scale=1)
281
+
282
+ sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False,
283
+ waveform_options=gr.WaveformOptions(
284
+ waveform_color="#FFB6C1",
285
+ waveform_progress_color="#FF0000",
286
+ skip_length=1,
287
+ show_controls=False,
288
+ ), scale=1 )
289
+
290
+ with gr.Row(variant="panel"):
291
+ sound2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
292
+ placeholder="Name of your instrument")
293
+ sound2sound_save_instrument_button = gr.Button(variant="primary",
294
+ value="Save instrument",
295
+ scale=1)
296
+
297
+ with gr.Row(variant="panel"):
298
+ sound2sound_origin_upload_latent_representation_image = gr.Image(label="Original latent representation",
299
+ type="numpy", height=800,
300
+ visible=False)
301
+ sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
302
+ label="Original quantized latent representation", type="numpy", height=800, visible=False)
303
+
304
+ sound2sound_origin_microphone_latent_representation_image = gr.Image(label="Original latent representation",
305
+ type="numpy", height=800,
306
+ visible=False)
307
+ sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
308
+ label="Original quantized latent representation", type="numpy", height=800, visible=False)
309
+
310
+ sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
311
+ type="numpy", height=800, visible=False)
312
+ sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
313
+ label="New sound quantized latent representation", type="numpy", height=800, visible=False)
314
+
315
+ sound2sound_origin_phase_image = gr.Image(label="Original upload phase",
316
+ type="numpy", visible=False)
317
+
318
+ sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
319
+ height=600, scale=1, visible=False)
320
+
321
+ sound2sound_origin_audio.change(receive_upload_origin_audio,
322
+ inputs=[sound2sound_duration_slider, sound2sound_origin_audio,
323
+ inpaintWithText_state],
324
+ outputs=[sound2sound_origin_spectrogram_image,
325
+ sound2sound_origin_phase_image,
326
+ sound2sound_origin_upload_latent_representation_image,
327
+ sound2sound_origin_upload_quantized_latent_representation_image,
328
+ sound2sound_origin_microphone_latent_representation_image,
329
+ sound2sound_origin_microphone_quantized_latent_representation_image,
330
+ inpaintWithText_state])
331
+
332
+ sound2sound_sample_button.click(sound2sound_sample,
333
+ inputs=[sound2sound_origin_spectrogram_image,
334
+ text2sound_prompts_textbox,
335
+ text2sound_negative_prompts_textbox,
336
+ sound2sound_batchsize_slider,
337
+ sound2sound_guidance_scale_slider,
338
+ sound2sound_sampler_radio,
339
+ sound2sound_sample_steps_slider,
340
+ sound2sound_noising_strength_slider,
341
+ sound2sound_seed_textbox,
342
+ sound2sound_inpaint_area_radio,
343
+ mask_time_begin_slider,
344
+ mask_time_end_slider,
345
+ mask_frequency_begin_slider,
346
+ mask_frequency_end_slider,
347
+ inpaintWithText_state],
348
+ outputs=[sound2sound_new_sound_latent_representation_image,
349
+ sound2sound_new_sound_quantized_latent_representation_image,
350
+ sound2sound_new_sound_spectrogram_image,
351
+ sound2sound_new_sound_phase_image,
352
+ sound2sound_new_sound_audio,
353
+ sound2sound_sample_index_slider,
354
+ sound2sound_seed_textbox,
355
+ inpaintWithText_state])
356
+
357
+ sound2sound_sample_index_slider.change(show_sound2sound_sample,
358
+ inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
359
+ outputs=[sound2sound_new_sound_latent_representation_image,
360
+ sound2sound_new_sound_quantized_latent_representation_image,
361
+ sound2sound_new_sound_spectrogram_image,
362
+ sound2sound_new_sound_phase_image,
363
+ sound2sound_new_sound_audio])
364
+
365
+ sound2sound_save_instrument_button.click(save_virtual_instrument,
366
+ inputs=[sound2sound_sample_index_slider,
367
+ sound2sound_instrument_name_textbox,
368
+ inpaintWithText_state,
369
+ virtual_instruments_state],
370
+ outputs=[virtual_instruments_state,
371
+ sound2sound_instrument_name_textbox])
webUI/natural_language_guided_4/instruments.py ADDED
@@ -0,0 +1,60 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+
4
+ from model.DiffSynthSampler import DiffSynthSampler
5
+ from tools import safe_int, read_wav_to_numpy
6
+ from webUI.natural_language_guided.utils import latent_representation_to_Gradio_image, \
7
+ encodeBatch2GradioOutput_STFT, add_instrument
8
+ from webUI.natural_language_guided_4.utils import resize_image_to_aspect_ratio
9
+
10
+
11
+ def get_instruments_module(gradioWebUI, virtual_instruments_state):
12
+
13
+ with gr.Tab("intruments"):
14
+ gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
15
+ with gr.Row(variant="panel"):
16
+ with gr.Column(scale=1):
17
+ input_text = gr.Textbox(label="input")
18
+
19
+ @gr.render(inputs=input_text)
20
+ def show_split(text):
21
+ textboxes = []
22
+
23
+ if len(text) == 0:
24
+ gr.Markdown("## No Input Provided")
25
+ else:
26
+ for letter in text:
27
+ textboxes.append(gr.Textbox(letter, interactive=True))
28
+
29
+ def merge(*splitted_texts):
30
+ out = ""
31
+ for t in splitted_texts:
32
+ out += t
33
+ return out
34
+
35
+ submit_button.click(merge, inputs=textboxes, outputs=merged_textbox)
36
+
37
+ submit_button = gr.Button("submit")
38
+
39
+ merged_textbox = gr.Textbox(placeholder="placeholder", interactive=False)
40
+
41
+ with gr.Column(scale=1):
42
+
43
+ @gr.render(inputs=virtual_instruments_state)
44
+ def check_instruments(virtual_instruments_dict):
45
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
46
+ instrument_names = list(virtual_instruments.keys())
47
+
48
+ instrument_dropdown = gr.Dropdown(
49
+ instrument_names, label="instrument", info="info placeholder"
50
+ )
51
+
52
+ def select_instrument(instrument):
53
+ print(f"instrument: {instrument}")
54
+ sr, signal = virtual_instruments[instrument]["signal"]
55
+ return {selected_instrument_audio: (sr, signal)}
56
+
57
+ instrument_dropdown.select(select_instrument, inputs=instrument_dropdown,
58
+ outputs=selected_instrument_audio)
59
+
60
+ selected_instrument_audio = gr.Audio(type="numpy", label="Play", scale=1, interactive=False)
webUI/natural_language_guided_4/load_presets.py ADDED
@@ -0,0 +1,81 @@
1
+ import os
2
+
3
+ import librosa
4
+ import mido
5
+ import numpy as np
6
+ import torch
7
+
8
+ from tools import read_wav_to_numpy, pad_STFT, encode_stft
9
+ from webUI.natural_language_guided_4.gradio_webUI import GradioWebUI
10
+ from webUI.natural_language_guided_4.utils import InputBatch2Encode_STFT
11
+
12
+
13
+ def load_presets(gradioWebUI: GradioWebUI):
14
+ # Load configurations
15
+ uNet = gradioWebUI.uNet
16
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
17
+ VAE_scale = gradioWebUI.VAE_scale
18
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
19
+
20
+ timesteps = gradioWebUI.timesteps
21
+ VAE_quantizer = gradioWebUI.VAE_quantizer
22
+ VAE_encoder = gradioWebUI.VAE_encoder
23
+ VAE_decoder = gradioWebUI.VAE_decoder
24
+ CLAP = gradioWebUI.CLAP
25
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
26
+ device = gradioWebUI.device
27
+ squared = gradioWebUI.squared
28
+ sample_rate = gradioWebUI.sample_rate
29
+ noise_strategy = gradioWebUI.noise_strategy
30
+
31
+ def add_preset_instruments(virtual_instruments, instrument_name):
32
+
33
+ instruments_path = os.path.join("webUI", "presets", "instruments", f"{instrument_name}.wav")
34
+ sample_rate, origin_audio = read_wav_to_numpy(instruments_path)
35
+
36
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
37
+ padded_D = pad_STFT(D)
38
+ encoded_D = encode_stft(padded_D)
39
+
40
+ # Todo: justify batchsize to 1
41
+ origin_spectrogram_batch_tensor = torch.from_numpy(
42
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
43
+
44
+ # Todo: remove hard-coding
45
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
46
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
47
+ squared=squared)
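+ # Each preset wav is encoded once at start-up; the cached latent representation is what the samplers reuse later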
48
+
49
+
50
+ virtual_instrument = {"latent_representation": origin_latent_representations[0].to("cpu").detach().numpy(),
51
+ "quantized_latent_representation": quantized_origin_latent_representations[0].to(
52
+ "cpu").detach().numpy(),
53
+ "sampler": "ddim",
54
+ "signal": (sample_rate, origin_audio),
55
+ "spectrogram_gradio_image": origin_flipped_log_spectrums[0],
56
+ "phase_gradio_image": origin_flipped_phases[0]}
57
+ virtual_instruments[f"preset_{instrument_name}"] = virtual_instrument
58
+ return virtual_instruments
59
+
60
+ virtual_instruments = {}
61
+ preset_instrument_names = ["ax", "electronic_sound", "organ", "synth_lead", "keyboard", "string"]
62
+ for preset_instrument_name in preset_instrument_names:
63
+ virtual_instruments = add_preset_instruments(virtual_instruments, preset_instrument_name)
64
+
65
+
66
+
67
+ def load_midi_files():
68
+
69
+ midis_dict = {}
70
+ midi_file_names = ["Ode_to_Joy_Easy_variation", "Air_on_the_G_String", "Canon_in_D"]
71
+
72
+ for midi_file_name in midi_file_names:
73
+ midi_path = os.path.join("webUI", "presets", "midis", f"{midi_file_name}.mid")
74
+ mid = mido.MidiFile(midi_path)
75
+ midis_dict[midi_file_name] = mid
76
+
77
+ return midis_dict
78
+
79
+ midis = load_midi_files()
80
+
81
+ return virtual_instruments, midis
webUI/natural_language_guided_4/note2music.py ADDED
+ import torch
2
+ import gradio as gr
3
+ import mido
4
+ from io import BytesIO
5
+ # import pyrubberband as pyrb
6
+
7
+ from webUI.natural_language_guided_4.track_maker import DiffSynth, Track
8
+
9
+
10
+ def get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state):
11
+ # Load configurations
12
+ uNet = gradioWebUI.uNet
13
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
14
+ VAE_scale = gradioWebUI.VAE_scale
15
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
16
+
17
+ timesteps = gradioWebUI.timesteps
18
+ VAE_quantizer = gradioWebUI.VAE_quantizer
19
+ VAE_decoder = gradioWebUI.VAE_decoder
20
+ CLAP = gradioWebUI.CLAP
21
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
22
+ device = gradioWebUI.device
23
+ squared = gradioWebUI.squared
24
+ sample_rate = gradioWebUI.sample_rate
25
+ noise_strategy = gradioWebUI.noise_strategy
26
+
27
+ def read_midi(midi, midi_dict):
28
+ mid = mido.MidiFile(file=BytesIO(midi))
29
+ tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
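+ # Wrap each raw MIDI track in Track so its note events can be counted for the summary below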
30
+
31
+ midi_info_text = f"Uploaded midi:"
32
+ for i, track in enumerate(tracks):
33
+ midi_info_text += f"\n{len(track.events)} events loaded from Track {i}."
34
+
35
+ midis = midi_dict["midis"]
36
+ midis["uploaded_midi"] = mid
37
+ midi_dict["midis"] = midis
38
+
39
+ return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
40
+ placeholder=midi_info_text),
41
+ current_midi_state: "uploaded_midi",
42
+ midi_files_state: midi_dict}
43
+
44
+ def make_track(inpaint_steps, current_midi_name, midi_dict, max_notes, noising_strength, attack, before_release, current_instruments,
45
+ virtual_instruments_dict):
46
+
47
+ if noising_strength < 1:
48
+ print(f"Warning: making track with noising_strength = {noising_strength} < 1")
49
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
50
+ sample_steps = int(inpaint_steps)
51
+
52
+ print(f"current_instruments: {current_instruments}")
53
+ instrument_names = current_instruments
54
+ instruments_configs = {}
55
+
56
+ for virtual_instrument_name in instrument_names:
57
+ virtual_instrument = virtual_instruments[virtual_instrument_name]
58
+
59
+ latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(
60
+ device)
61
+ sampler = virtual_instrument["sampler"]
62
+
63
+ batchsize = 1
64
+
65
+ latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
66
+
67
+ instruments_configs[virtual_instrument_name] = {
68
+ 'sample_steps': sample_steps,
69
+ 'sampler': sampler,
70
+ 'noising_strength': noising_strength,
71
+ 'latent_representation': latent_representation,
72
+ 'attack': attack,
73
+ 'before_release': before_release}
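+ # One sampling config per selected instrument; DiffSynth uses these to render the MIDI, presumably pairing each track with its assigned instrument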
74
+
75
+ diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
76
+
77
+ midis = midi_dict["midis"]
78
+ mid = midis[current_midi_name]
79
+ full_audio = diffSynth.get_music(mid, instrument_names, max_notes=max_notes)
80
+
81
+ return {track_audio: (sample_rate, full_audio)}
82
+
83
+ with gr.Tab("Arrangement"):
84
+ default_instrument = "preset_string"
85
+ current_instruments_state = gr.State(value=[default_instrument for _ in range(100)])
86
+ current_midi_state = gr.State(value="Ode_to_Joy_Easy_variation")
87
+
88
+ gr.Markdown("Make music with generated sounds!")
89
+ with gr.Row(variant="panel"):
90
+ with gr.Column(scale=3):
91
+
92
+ @gr.render(inputs=midi_files_state)
93
+ def check_midis(midi_dict):
94
+ midis = midi_dict["midis"]
95
+ midi_names = list(midis.keys())
96
+
97
+ instrument_dropdown = gr.Dropdown(
98
+ midi_names, label="Select from preset midi files", value="Ode_to_Joy_Easy_variation"
99
+ )
100
+
101
+ def select_midi(midi_name):
102
+ # print(f"midi_name: {midi_name}")
103
+ mid = midis[midi_name]
104
+ tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
105
+ midi_info_text = f"Name: {midi_name}"
106
+ for i, track in enumerate(tracks):
107
+ midi_info_text += f"\n{len(track.events)} events loaded from Track {i}."
108
+
109
+ return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
110
+ placeholder=midi_info_text),
111
+ current_midi_state: midi_name}
112
+
113
+ instrument_dropdown.select(select_midi, inputs=instrument_dropdown,
114
+ outputs=[midi_info_textbox, current_midi_state])
115
+
116
+ midi_file = gr.File(label="Upload a midi file", type="binary", scale=1)
117
+ midi_info_textbox = gr.Textbox(label="Midi info", lines=10,
118
+ placeholder="Please select/upload a midi on the left.", scale=3,
119
+ visible=False)
120
+
121
+ with gr.Column(scale=3, ):
122
+
123
+ @gr.render(inputs=[current_midi_state, midi_files_state, virtual_instruments_state])
124
+ def render_select_instruments(current_midi_name, midi_dict, virtual_instruments_dict):
125
+
126
+ virtual_instruments = virtual_instruments_dict["virtual_instruments"]
127
+ instrument_names = list(virtual_instruments.keys())
128
+
129
+ midis = midi_dict["midis"]
130
+ mid = midis[current_midi_name]
131
+ tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
132
+
133
+ dropdowns = []
134
+ for i, track in enumerate(tracks):
135
+ dropdowns.append(gr.Dropdown(
136
+ instrument_names, value=default_instrument, label=f"Track {i}: {len(track.events)} notes",
137
+ info=f"Select an instrument to play this track!"
138
+ ))
139
+
140
+ def select_instruments(*instruments):
141
+ return instruments
142
+
143
+ for d in dropdowns:
144
+ d.select(select_instruments, inputs=dropdowns,
145
+ outputs=current_instruments_state)
146
+
147
+
148
+ with gr.Column(scale=3):
149
+ max_notes_slider = gr.Slider(minimum=10.0, maximum=999.0, value=100.0, step=1.0,
150
+ label="Maximum number of synthesized notes in each track",
151
+ info="Lower this value to prevent Gradio timeouts")
152
+ make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
153
+ track_audio = gr.Audio(type="numpy", label="Play music", interactive=False)
154
+
155
+ with gr.Row(variant="panel", visible=False):
156
+ with gr.Tab("Origin sound"):
157
+ inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
158
+ label="inpaint_steps")
159
+ noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
160
+ end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
161
+ label="end_noise_level_ratio")
162
+ attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
163
+ before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
164
+ label="before_release in sec")
165
+ release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
166
+ mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
167
+ label="mask_flexivity")
168
+ with gr.Tab("Length adjustment config"):
169
+ use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
170
+ test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
171
+ test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
172
+ test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
173
+ duration_slider = gradioWebUI.get_duration_slider()
174
+ with gr.Tab("Pitch shift config"):
175
+ pitch_shift_radio = gr.Radio(choices=["librosa", "torchaudio", "rubberband"],
176
+ value="librosa")
177
+
178
+ with gr.Row(variant="panel", visible=False):
179
+ with gr.Column(scale=2):
180
+ with gr.Row(variant="panel"):
181
+ source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
182
+ height=600, scale=1)
183
+ source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
184
+ height=600, scale=1)
185
+
186
+ make_track_button.click(make_track,
187
+ inputs=[inpaint_steps_slider, current_midi_state, midi_files_state,
188
+ max_notes_slider, noising_strength_slider,
189
+ attack_slider,
190
+ before_release_slider,
191
+ current_instruments_state,
192
+ virtual_instruments_state],
193
+ outputs=[track_audio])
194
+
195
+ midi_file.change(read_midi,
196
+ inputs=[midi_file,
197
+ midi_files_state],
198
+ outputs=[midi_info_textbox,
199
+ current_midi_state,
200
+ midi_files_state])
webUI/natural_language_guided_4/rec.py ADDED
1
+ import gradio as gr
2
+
3
+ from data_generation.nsynth import get_nsynth_dataloader
4
+ from webUI.natural_language_guided_STFT.utils import encodeBatch2GradioOutput_STFT, InputBatch2Encode_STFT, \
5
+ latent_representation_to_Gradio_image
6
+
7
+
8
+ def get_recSTFT_module(gradioWebUI, reconstruction_state):
9
+ # Load configurations
10
+ uNet = gradioWebUI.uNet
11
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
12
+ VAE_scale = gradioWebUI.VAE_scale
13
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
14
+
15
+ timesteps = gradioWebUI.timesteps
16
+ VAE_quantizer = gradioWebUI.VAE_quantizer
17
+ VAE_encoder = gradioWebUI.VAE_encoder
18
+ VAE_decoder = gradioWebUI.VAE_decoder
19
+ CLAP = gradioWebUI.CLAP
20
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
21
+ device = gradioWebUI.device
22
+ squared = gradioWebUI.squared
23
+ sample_rate = gradioWebUI.sample_rate
24
+ noise_strategy = gradioWebUI.noise_strategy
25
+
26
+ def generate_reconstruction_samples(sample_source, batchsize_slider, encodeCache,
27
+ reconstruction_samples):
28
+
29
+ vae_batchsize = int(batchsize_slider)
30
+
31
+ if sample_source == "text2sound_trainSTFT":
32
+ training_dataset_path = f'data/NSynth/nsynth-STFT-train-52.hdf5' # Make sure to use your actual path
33
+ iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
34
+ get_latent_representation=False, with_meta_data=False,
35
+ task="STFT")
36
+ elif sample_source == "text2sound_validSTFT":
37
+ training_dataset_path = f'data/NSynth/nsynth-STFT-valid-52.hdf5' # Make sure to use your actual path
38
+ iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
39
+ get_latent_representation=False, with_meta_data=False,
40
+ task="STFT")
41
+ elif sample_source == "text2sound_testSTFT":
42
+ training_dataset_path = f'data/NSynth/nsynth-STFT-test-52.hdf5' # Make sure to use your actual path
43
+ iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
44
+ get_latent_representation=False, with_meta_data=False,
45
+ task="STFT")
46
+ else:
47
+ raise NotImplementedError()
48
+
49
+ spectrogram_batch = next(iter(iterator))
50
+
51
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, latent_representations, quantized_latent_representations = InputBatch2Encode_STFT(
52
+ VAE_encoder, spectrogram_batch, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer, squared=squared)
53
+
54
+ latent_representation_gradio_images, quantized_latent_representation_gradio_images = [], []
55
+ for i in range(vae_batchsize):
56
+ latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
57
+ quantized_latent_representation_gradio_images.append(
58
+ latent_representation_to_Gradio_image(quantized_latent_representations[i]))
59
+
60
+ if quantized_latent_representations is None:
61
+ quantized_latent_representations = latent_representations
62
+ reconstruction_flipped_log_spectrums, reconstruction_flipped_phases, reconstruction_signals, reconstruction_flipped_log_spectrums_WOA, reconstruction_flipped_phases_WOA, reconstruction_signals_WOA = encodeBatch2GradioOutput_STFT(VAE_decoder,
63
+ quantized_latent_representations,
64
+ resolution=(
65
+ 512,
66
+ width * VAE_scale),
67
+ original_STFT_batch=spectrogram_batch
68
+ )
69
+
70
+ reconstruction_samples["origin_flipped_log_spectrums"] = origin_flipped_log_spectrums
71
+ reconstruction_samples["origin_flipped_phases"] = origin_flipped_phases
72
+ reconstruction_samples["origin_signals"] = origin_signals
73
+ reconstruction_samples["latent_representation_gradio_images"] = latent_representation_gradio_images
74
+ reconstruction_samples[
75
+ "quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
76
+ reconstruction_samples[
77
+ "reconstruction_flipped_log_spectrums"] = reconstruction_flipped_log_spectrums
78
+ reconstruction_samples[
79
+ "reconstruction_flipped_phases"] = reconstruction_flipped_phases
80
+ reconstruction_samples["reconstruction_signals"] = reconstruction_signals
81
+ reconstruction_samples[
82
+ "reconstruction_flipped_log_spectrums_WOA"] = reconstruction_flipped_log_spectrums_WOA
83
+ reconstruction_samples[
84
+ "reconstruction_flipped_phases_WOA"] = reconstruction_flipped_phases_WOA
85
+ reconstruction_samples["reconstruction_signals_WOA"] = reconstruction_signals_WOA
86
+ reconstruction_samples["sampleRate"] = sample_rate
87
+
88
+ latent_representation_gradio_image = reconstruction_samples["latent_representation_gradio_images"][0]
89
+ quantized_latent_representation_gradio_image = \
90
+ reconstruction_samples["quantized_latent_representation_gradio_images"][0]
91
+ origin_flipped_log_spectrum = reconstruction_samples["origin_flipped_log_spectrums"][0]
92
+ origin_flipped_phase = reconstruction_samples["origin_flipped_phases"][0]
93
+ origin_signal = reconstruction_samples["origin_signals"][0]
94
+ reconstruction_flipped_log_spectrum = reconstruction_samples["reconstruction_flipped_log_spectrums"][0]
95
+ reconstruction_flipped_phase = reconstruction_samples["reconstruction_flipped_phases"][0]
96
+ reconstruction_signal = reconstruction_samples["reconstruction_signals"][0]
97
+ reconstruction_flipped_log_spectrum_WOA = reconstruction_samples["reconstruction_flipped_log_spectrums_WOA"][0]
98
+ reconstruction_flipped_phase_WOA = reconstruction_samples["reconstruction_flipped_phases_WOA"][0]
99
+ reconstruction_signal_WOA = reconstruction_samples["reconstruction_signals_WOA"][0]
100
+
101
+ return {origin_amplitude_image_output: origin_flipped_log_spectrum,
102
+ origin_phase_image_output: origin_flipped_phase,
103
+ origin_audio_output: (sample_rate, origin_signal),
104
+ latent_representation_image_output: latent_representation_gradio_image,
105
+ quantized_latent_representation_image_output: quantized_latent_representation_gradio_image,
106
+ reconstruction_amplitude_image_output: reconstruction_flipped_log_spectrum,
107
+ reconstruction_phase_image_output: reconstruction_flipped_phase,
108
+ reconstruction_audio_output: (sample_rate, reconstruction_signal),
109
+ reconstruction_amplitude_image_output_WOA: reconstruction_flipped_log_spectrum_WOA,
110
+ reconstruction_phase_image_output_WOA: reconstruction_flipped_phase_WOA,
111
+ reconstruction_audio_output_WOA: (sample_rate, reconstruction_signal_WOA),
112
+ sample_index_slider: gr.update(minimum=0, maximum=vae_batchsize - 1, value=0, step=1.0,
113
+ label="Sample index.",
114
+ info="Slide to view other samples", scale=1, visible=True),
115
+ reconstruction_state: encodeCache,
116
+ reconstruction_samples_state: reconstruction_samples}
117
+
118
+ def show_reconstruction_sample(sample_index, encodeCache_state, reconstruction_samples_state):
119
+ sample_index = int(sample_index)
120
+ sampleRate = reconstruction_samples_state["sampleRate"]
121
+ latent_representation_gradio_image = reconstruction_samples_state["latent_representation_gradio_images"][
122
+ sample_index]
123
+ quantized_latent_representation_gradio_image = \
124
+ reconstruction_samples_state["quantized_latent_representation_gradio_images"][sample_index]
125
+ origin_flipped_log_spectrum = reconstruction_samples_state["origin_flipped_log_spectrums"][sample_index]
126
+ origin_flipped_phase = reconstruction_samples_state["origin_flipped_phases"][sample_index]
127
+ origin_signal = reconstruction_samples_state["origin_signals"][sample_index]
128
+ reconstruction_flipped_log_spectrum = reconstruction_samples_state["reconstruction_flipped_log_spectrums"][
129
+ sample_index]
130
+ reconstruction_flipped_phase = reconstruction_samples_state["reconstruction_flipped_phases"][
131
+ sample_index]
132
+ reconstruction_signal = reconstruction_samples_state["reconstruction_signals"][sample_index]
133
+ reconstruction_flipped_log_spectrum_WOA = reconstruction_samples_state["reconstruction_flipped_log_spectrums_WOA"][
134
+ sample_index]
135
+ reconstruction_flipped_phase_WOA = reconstruction_samples_state["reconstruction_flipped_phases_WOA"][
136
+ sample_index]
137
+ reconstruction_signal_WOA = reconstruction_samples_state["reconstruction_signals_WOA"][sample_index]
138
+ return origin_flipped_log_spectrum, origin_flipped_phase, (sampleRate, origin_signal), \
139
+ latent_representation_gradio_image, quantized_latent_representation_gradio_image, \
140
+ reconstruction_flipped_log_spectrum, reconstruction_flipped_phase, (sampleRate, reconstruction_signal), \
141
+ reconstruction_flipped_log_spectrum_WOA, reconstruction_flipped_phase_WOA, (sampleRate, reconstruction_signal_WOA), \
142
+ encodeCache_state, reconstruction_samples_state
143
+
144
+ with gr.Tab("Reconstruction"):
145
+ reconstruction_samples_state = gr.State(value={})
146
+ gr.Markdown("Test reconstruction.")
147
+ with gr.Row(variant="panel"):
148
+ with gr.Column():
149
+ sample_source_radio = gr.Radio(
150
+ choices=["synthetic", "external", "text2sound_trainSTFT", "text2sound_testSTFT", "text2sound_validSTFT"],
151
+ value="text2sound_trainf", info="Info placeholder", scale=2)
152
+ batchsize_slider = gr.Slider(minimum=1., maximum=16., value=4., step=1.,
153
+ label="batchsize")
154
+ with gr.Column():
155
+ generate_button = gr.Button(variant="primary", value="Generate reconstruction samples", scale=1)
156
+ sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, label="Sample index.",
157
+ info="Slide to view other samples", scale=1, visible=False)
158
+ with gr.Row(variant="panel"):
159
+ with gr.Column():
160
+ origin_amplitude_image_output = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
161
+ origin_phase_image_output = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
162
+ origin_audio_output = gr.Audio(type="numpy", label="Play the example!")
163
+ with gr.Column():
164
+ reconstruction_amplitude_image_output = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
165
+ reconstruction_phase_image_output = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
166
+ reconstruction_audio_output = gr.Audio(type="numpy", label="Play the example!")
167
+ with gr.Column():
168
+ reconstruction_amplitude_image_output_WOA = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
169
+ reconstruction_phase_image_output_WOA = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
170
+ reconstruction_audio_output_WOA = gr.Audio(type="numpy", label="Play the example!")
171
+ with gr.Row(variant="panel", equal_height=True):
172
+ latent_representation_image_output = gr.Image(label="latent_representation", type="numpy", height=300, width=100)
173
+ quantized_latent_representation_image_output = gr.Image(label="quantized", type="numpy", height=300, width=100)
174
+
175
+ generate_button.click(generate_reconstruction_samples,
176
+ inputs=[sample_source_radio, batchsize_slider, reconstruction_state,
177
+ reconstruction_samples_state],
178
+ outputs=[origin_amplitude_image_output, origin_phase_image_output, origin_audio_output,
179
+ latent_representation_image_output, quantized_latent_representation_image_output,
180
+ reconstruction_amplitude_image_output, reconstruction_phase_image_output, reconstruction_audio_output,
181
+ reconstruction_amplitude_image_output_WOA, reconstruction_phase_image_output_WOA, reconstruction_audio_output_WOA,
182
+ sample_index_slider, reconstruction_state, reconstruction_samples_state])
183
+
184
+ sample_index_slider.change(show_reconstruction_sample,
185
+ inputs=[sample_index_slider, reconstruction_state, reconstruction_samples_state],
186
+ outputs=[origin_amplitude_image_output, origin_phase_image_output, origin_audio_output,
187
+ latent_representation_image_output, quantized_latent_representation_image_output,
188
+ reconstruction_amplitude_image_output, reconstruction_phase_image_output, reconstruction_audio_output,
189
+ reconstruction_amplitude_image_output_WOA, reconstruction_phase_image_output_WOA, reconstruction_audio_output_WOA,
190
+ reconstruction_state, reconstruction_samples_state])
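A quick aside on the callback style used throughout these tabs: a Gradio event handler may return a dictionary keyed by its output components, which is how generate_reconstruction_samples above reveals sample_index_slider through gr.update. A minimal, self-contained sketch of that pattern (the component names and values below are illustrative, not taken from this repository):

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    txt = gr.Textbox(label="Status")
    sld = gr.Slider(0, 3, step=1, visible=False, label="Sample index")

    def handler():
        # Returning a dict lets a handler update only some outputs and change
        # component properties (here: visibility and range) via gr.update.
        return {txt: "done", sld: gr.update(visible=True, maximum=7)}

    btn.click(handler, inputs=[], outputs=[txt, sld])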
webUI/natural_language_guided_4/sound2sound_with_text.py ADDED
@@ -0,0 +1,325 @@
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+
6
+ from model.DiffSynthSampler import DiffSynthSampler
7
+ from tools import pad_STFT, encode_stft
8
+ from tools import safe_int, adjust_audio_length
9
+ from webUI.natural_language_guided_4.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT, \
10
+ latent_representation_to_Gradio_image, resize_image_to_aspect_ratio, add_instrument
11
+
12
+
13
+ def get_sound2sound_with_text_module(gradioWebUI, sound2sound_with_text_state, virtual_instruments_state):
14
+ # Load configurations
15
+ uNet = gradioWebUI.uNet
16
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
17
+ VAE_scale = gradioWebUI.VAE_scale
18
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
19
+ timesteps = gradioWebUI.timesteps
20
+ VAE_encoder = gradioWebUI.VAE_encoder
21
+ VAE_quantizer = gradioWebUI.VAE_quantizer
22
+ VAE_decoder = gradioWebUI.VAE_decoder
23
+ CLAP = gradioWebUI.CLAP
24
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
25
+ device = gradioWebUI.device
26
+ squared = gradioWebUI.squared
27
+ sample_rate = gradioWebUI.sample_rate
28
+ noise_strategy = gradioWebUI.noise_strategy
29
+
30
+ def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin,
31
+ sound2sound_with_text_dict, virtual_instruments_dict):
32
+ origin_sr, origin_audio = sound2sound_origin
33
+ origin_audio = origin_audio / np.max(np.abs(origin_audio))
34
+
35
+ width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
36
+ audio_length = 256 * (VAE_scale * width - 1)
37
+ origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
38
+
39
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
40
+ padded_D = pad_STFT(D)
41
+ encoded_D = encode_stft(padded_D)
42
+
43
+ # Todo: justify batchsize to 1
44
+ origin_spectrogram_batch_tensor = torch.from_numpy(
45
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
46
+
47
+ # Todo: remove hard-coding
48
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
49
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
50
+ squared=squared)
51
+
52
+ sound2sound_with_text_dict["origin_latent_representations"] = origin_latent_representations.tolist()
53
+ sound2sound_with_text_dict[
54
+ "sound2sound_origin_latent_representation_image"] = latent_representation_to_Gradio_image(
55
+ origin_latent_representations[0]).tolist()
56
+ sound2sound_with_text_dict[
57
+ "sound2sound_origin_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
58
+ quantized_origin_latent_representations[0]).tolist()
59
+
60
+
61
+ return {sound2sound_origin_spectrogram_image: resize_image_to_aspect_ratio(origin_flipped_log_spectrums[0],
62
+ 1.55,
63
+ 1),
64
+ sound2sound_origin_phase_image: resize_image_to_aspect_ratio(origin_flipped_phases[0],
65
+ 1.55,
66
+ 1),
67
+ sound2sound_origin_latent_representation_image: latent_representation_to_Gradio_image(
68
+ origin_latent_representations[0]),
69
+ sound2sound_origin_quantized_latent_representation_image: latent_representation_to_Gradio_image(
70
+ quantized_origin_latent_representations[0]),
71
+ sound2sound_with_text_state: sound2sound_with_text_dict,
72
+ virtual_instruments_state: virtual_instruments_dict}
73
+
74
+ def sound2sound_sample(sound2sound_prompts, sound2sound_negative_prompts, sound2sound_batchsize,
75
+ sound2sound_guidance_scale, sound2sound_sampler,
76
+ sound2sound_sample_steps,
77
+ sound2sound_noising_strength, sound2sound_seed, sound2sound_dict, virtual_instruments_dict):
78
+ # input processing
79
+ sound2sound_seed = safe_int(sound2sound_seed, 12345678)
80
+ sound2sound_batchsize = int(sound2sound_batchsize)
81
+ noising_strength = sound2sound_noising_strength
82
+ sound2sound_sample_steps = int(sound2sound_sample_steps)
83
+ CFG = int(sound2sound_guidance_scale)
84
+
85
+ origin_latent_representations = torch.tensor(
86
+ sound2sound_dict["origin_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
87
+ device)
88
+
89
+ # sound2sound
90
+ text2sound_embedding = \
91
+ CLAP.get_text_features(**CLAP_tokenizer([sound2sound_prompts], padding=True, return_tensors="pt"))[0].to(
92
+ device)
93
+
94
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
95
+ negative_condition = \
96
+ CLAP.get_text_features(**CLAP_tokenizer([sound2sound_negative_prompts], padding=True, return_tensors="pt"))[
97
+ 0]
98
+ mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))
99
+
100
+ normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
101
+ mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
102
+
103
+ condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
104
+
105
+ # Todo: remove hard-coding
106
+ width = origin_latent_representations.shape[-1]
107
+ new_sound_latent_representations, initial_noise = \
108
+ mySampler.img_guided_sample(model=uNet, shape=(sound2sound_batchsize, channels, height, width),
109
+ seed=sound2sound_seed,
110
+ noising_strength=noising_strength,
111
+ guide_img=origin_latent_representations, return_tensor=True,
112
+ condition=condition,
113
+ sampler=sound2sound_sampler)
114
+
115
+ new_sound_latent_representations = new_sound_latent_representations[-1]
116
+
117
+ # Quantize new sound latent representations
118
+ quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
119
+
120
+ new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
121
+ VAE_decoder,
122
+ quantized_new_sound_latent_representations,
123
+ resolution=(
124
+ 512,
125
+ width * VAE_scale),
126
+ original_STFT_batch=None
127
+ )
128
+
129
+ new_sound_latent_representation_gradio_images = []
130
+ new_sound_quantized_latent_representation_gradio_images = []
131
+ new_sound_spectrogram_gradio_images = []
132
+ new_sound_phase_gradio_images = []
133
+ new_sound_rec_signals_gradio = []
134
+ for i in range(sound2sound_batchsize):
135
+ new_sound_latent_representation_gradio_images.append(
136
+ latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
137
+ new_sound_quantized_latent_representation_gradio_images.append(
138
+ latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
139
+ new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
140
+ new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
141
+ new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
142
+ sound2sound_dict[
143
+ "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
144
+ sound2sound_dict[
145
+ "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
146
+ sound2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
147
+ sound2sound_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
148
+ sound2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
149
+
150
+ # save instrument
151
+ sound2sound_dict["latent_representations"] = new_sound_latent_representations.to("cpu").detach().numpy()
152
+ sound2sound_dict["quantized_latent_representations"] = quantized_new_sound_latent_representations.to(
153
+ "cpu").detach().numpy()
154
+ sound2sound_dict["condition"] = condition.to("cpu").detach().numpy()
155
+ sound2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
156
+ sound2sound_dict["guidance_scale"] = CFG
157
+ sound2sound_dict["sampler"] = sound2sound_sampler
158
+
159
+ return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
160
+ new_sound_latent_representations[0]),
161
+ sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
162
+ quantized_new_sound_latent_representations[0]),
163
+ sound2sound_new_sound_spectrogram_image: resize_image_to_aspect_ratio(new_sound_flipped_log_spectrums[0],
164
+ 1.55,
165
+ 1),
166
+ sound2sound_new_sound_phase_image: resize_image_to_aspect_ratio(new_sound_flipped_phases[0],
167
+ 1.55,
168
+ 1),
169
+ sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
170
+ sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
171
+ step=1.0,
172
+ visible=True,
173
+ label="Sample index",
174
+ info="Swipe to view other samples"),
175
+ sound2sound_seed_textbox: sound2sound_seed,
176
+ sound2sound_with_text_state: sound2sound_dict,
177
+ virtual_instruments_state: virtual_instruments_dict}
178
+
179
+ def show_sound2sound_sample(sound2sound_sample_index, sound2sound_with_text_dict):
180
+ sample_index = int(sound2sound_sample_index)
181
+ return {sound2sound_new_sound_latent_representation_image:
182
+ sound2sound_with_text_dict["new_sound_latent_representation_gradio_images"][sample_index],
183
+ sound2sound_new_sound_quantized_latent_representation_image:
184
+ sound2sound_with_text_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
185
+ sound2sound_new_sound_spectrogram_image: resize_image_to_aspect_ratio(
186
+ sound2sound_with_text_dict["new_sound_spectrogram_gradio_images"][
187
+ sample_index], 1.55, 1),
188
+ sound2sound_new_sound_phase_image: resize_image_to_aspect_ratio(
189
+ sound2sound_with_text_dict["new_sound_phase_gradio_images"][
190
+ sample_index], 1.55, 1),
191
+ sound2sound_new_sound_audio: sound2sound_with_text_dict["new_sound_rec_signals_gradio"][sample_index]}
192
+
193
+ def save_virtual_instrument(sample_index, virtual_instrument_name, sound2sound_dict, virtual_instruments_dict):
194
+ virtual_instruments_dict = add_instrument(sound2sound_dict, virtual_instruments_dict, virtual_instrument_name,
195
+ sample_index)
196
+
197
+ return {virtual_instruments_state: virtual_instruments_dict,
198
+ text2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
199
+ placeholder=f"Saved as {virtual_instrument_name}!")}
200
+
201
+ with gr.Tab("Sound2Sound"):
202
+ gr.Markdown("Generate new sound based on a given sound!")
203
+ with gr.Row(variant="panel"):
204
+ with gr.Column(scale=3):
205
+ sound2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
206
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
207
+
208
+ with gr.Column(scale=1):
209
+ sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
210
+
211
+ sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
212
+ label="Sample index",
213
+ info="Swipe to view other samples")
214
+
215
+ with gr.Row(variant="panel"):
216
+ with gr.Column(scale=1):
217
+ with gr.Tab("Origin sound"):
218
+ sound2sound_duration_slider = gradioWebUI.get_duration_slider()
219
+
220
+ sound2sound_origin_audio = gr.Audio(
221
+ sources=["microphone", "upload"], label="Upload/Record source sound",
222
+ waveform_options=gr.WaveformOptions(
223
+ waveform_color="#01C6FF",
224
+ waveform_progress_color="#0066B4",
225
+ skip_length=1,
226
+ show_controls=False,
227
+ ),
228
+ )
229
+
230
+ with gr.Row(variant="panel"):
231
+ sound2sound_origin_spectrogram_image = gr.Image(label="Original upload spectrogram",
232
+ type="numpy",visible=True)
233
+ sound2sound_origin_phase_image = gr.Image(label="Original upload phase",
234
+ type="numpy", visible=True)
235
+
236
+ with gr.Tab("Sound2sound settings"):
237
+ sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
238
+ sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
239
+ sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
240
+ sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider()
241
+ sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
242
+ sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()
243
+
244
+ with gr.Column(scale=1):
245
+ sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False,
246
+ waveform_options=gr.WaveformOptions(
247
+ waveform_color="#FFB6C1",
248
+ waveform_progress_color="#FF0000",
249
+ skip_length=1,
250
+ show_controls=False,
251
+ ), )
252
+ with gr.Row(variant="panel"):
253
+ sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
254
+ scale=1)
255
+ sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
256
+ scale=1)
257
+
258
+ with gr.Row(variant="panel",):
259
+ text2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=2,
260
+ placeholder="Name of your instrument",
261
+ scale=1)
262
+ text2sound_save_instrument_button = gr.Button(variant="primary",
263
+ value="Save instrument",
264
+ scale=1)
265
+
266
+ with gr.Row(variant="panel"):
267
+ sound2sound_origin_latent_representation_image = gr.Image(label="Original latent representation",
268
+ type="numpy", height=800,
269
+ visible=False)
270
+ sound2sound_origin_quantized_latent_representation_image = gr.Image(
271
+ label="Original quantized latent representation", type="numpy", height=800, visible=False)
272
+
273
+ sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
274
+ type="numpy", height=800, visible=False)
275
+ sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
276
+ label="New sound quantized latent representation", type="numpy", height=800, visible=False)
277
+
278
+ sound2sound_origin_audio.change(receive_upload_origin_audio,
279
+ inputs=[sound2sound_duration_slider,
280
+ sound2sound_origin_audio,
281
+ sound2sound_with_text_state,
282
+ virtual_instruments_state],
283
+ outputs=[sound2sound_origin_spectrogram_image,
284
+ sound2sound_origin_phase_image,
285
+ sound2sound_origin_latent_representation_image,
286
+ sound2sound_origin_quantized_latent_representation_image,
287
+ sound2sound_with_text_state,
288
+ virtual_instruments_state])
289
+
290
+ sound2sound_sample_button.click(sound2sound_sample,
291
+ inputs=[sound2sound_prompts_textbox,
292
+ text2sound_negative_prompts_textbox,
293
+ sound2sound_batchsize_slider,
294
+ sound2sound_guidance_scale_slider,
295
+ sound2sound_sampler_radio,
296
+ sound2sound_sample_steps_slider,
297
+ sound2sound_noising_strength_slider,
298
+ sound2sound_seed_textbox,
299
+ sound2sound_with_text_state,
300
+ virtual_instruments_state],
301
+ outputs=[sound2sound_new_sound_latent_representation_image,
302
+ sound2sound_new_sound_quantized_latent_representation_image,
303
+ sound2sound_new_sound_spectrogram_image,
304
+ sound2sound_new_sound_phase_image,
305
+ sound2sound_new_sound_audio,
306
+ sound2sound_sample_index_slider,
307
+ sound2sound_seed_textbox,
308
+ sound2sound_with_text_state,
309
+ virtual_instruments_state])
310
+
311
+ text2sound_save_instrument_button.click(save_virtual_instrument,
312
+ inputs=[sound2sound_sample_index_slider,
313
+ text2sound_instrument_name_textbox,
314
+ sound2sound_with_text_state,
315
+ virtual_instruments_state],
316
+ outputs=[virtual_instruments_state,
317
+ text2sound_instrument_name_textbox])
318
+
319
+ sound2sound_sample_index_slider.change(show_sound2sound_sample,
320
+ inputs=[sound2sound_sample_index_slider, sound2sound_with_text_state],
321
+ outputs=[sound2sound_new_sound_latent_representation_image,
322
+ sound2sound_new_sound_quantized_latent_representation_image,
323
+ sound2sound_new_sound_spectrogram_image,
324
+ sound2sound_new_sound_phase_image,
325
+ sound2sound_new_sound_audio])
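For orientation, a minimal sketch of how a module factory such as get_sound2sound_with_text_module is typically mounted in a Gradio Blocks app. The build_demo wrapper, the gradio_web_ui argument name, and the empty initial state values are illustrative assumptions, not part of this commit:

import gradio as gr

from webUI.natural_language_guided_4.sound2sound_with_text import get_sound2sound_with_text_module


def build_demo(gradio_web_ui):
    # gradio_web_ui is assumed to expose the attributes the factory reads
    # (uNet, VAE_encoder/quantizer/decoder, CLAP, device, sample_rate, ...)
    # as well as the get_*_slider / get_*_radio helpers used for the settings tab.
    with gr.Blocks() as demo:
        sound2sound_state = gr.State(value={})          # per-session cache written by the callbacks
        virtual_instruments_state = gr.State(value={})  # filled when instruments are saved
        get_sound2sound_with_text_module(gradio_web_ui, sound2sound_state, virtual_instruments_state)
    return demo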
webUI/natural_language_guided_4/super_resolution_with_text.py ADDED
@@ -0,0 +1,387 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ from scipy.ndimage import zoom
6
+
7
+ from model.DiffSynthSampler import DiffSynthSampler
8
+ from tools import adjust_audio_length, rescale, safe_int, pad_STFT, encode_stft
9
+ from webUI.natural_language_guided_STFT.utils import latent_representation_to_Gradio_image
10
+ from webUI.natural_language_guided_STFT.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT
11
+
12
+
13
+ def get_super_resolution_with_text_module(gradioWebUI, inpaintWithText_state):
14
+ # Load configurations
15
+ uNet = gradioWebUI.uNet
16
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
17
+ VAE_scale = gradioWebUI.VAE_scale
18
+ height, width, channels = int(freq_resolution/VAE_scale), int(time_resolution/VAE_scale), gradioWebUI.channels
19
+ timesteps = gradioWebUI.timesteps
20
+ VAE_encoder = gradioWebUI.VAE_encoder
21
+ VAE_quantizer = gradioWebUI.VAE_quantizer
22
+ VAE_decoder = gradioWebUI.VAE_decoder
23
+ CLAP = gradioWebUI.CLAP
24
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
25
+ device = gradioWebUI.device
26
+ squared = gradioWebUI.squared
27
+ sample_rate = gradioWebUI.sample_rate
28
+ noise_strategy = gradioWebUI.noise_strategy
29
+
30
+ def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin_source, sound2sound_origin_upload, sound2sound_origin_microphone,
31
+ inpaintWithText_dict):
32
+
33
+ if sound2sound_origin_source == "upload":
34
+ origin_sr, origin_audio = sound2sound_origin_upload
35
+ else:
36
+ origin_sr, origin_audio = sound2sound_origin_microphone
37
+
38
+ origin_audio = origin_audio / np.max(np.abs(origin_audio))
39
+
40
+ width = int(time_resolution*((sound2sound_duration+1)/4) / VAE_scale)
41
+ audio_length = 256 * (VAE_scale * width - 1)
42
+ origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
43
+
44
+ D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
45
+ padded_D = pad_STFT(D)
46
+ encoded_D = encode_stft(padded_D)
47
+
48
+ # Todo: justify batchsize to 1
49
+ origin_spectrogram_batch_tensor = torch.from_numpy(
50
+ np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
51
+
52
+ # Todo: remove hard-coding
53
+ origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
54
+ VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer, squared=squared)
55
+
56
+ if sound2sound_origin_source == "upload":
57
+ inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
58
+ inpaintWithText_dict[
59
+ "sound2sound_origin_upload_latent_representation_image"] = latent_representation_to_Gradio_image(
60
+ origin_latent_representations[0]).tolist()
61
+ inpaintWithText_dict[
62
+ "sound2sound_origin_upload_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
63
+ quantized_origin_latent_representations[0]).tolist()
64
+ return {sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
65
+ sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
66
+ sound2sound_origin_spectrogram_microphone_image: gr.update(),
67
+ sound2sound_origin_phase_microphone_image: gr.update(),
68
+ sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
69
+ origin_latent_representations[0]),
70
+ sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
71
+ quantized_origin_latent_representations[0]),
72
+ sound2sound_origin_microphone_latent_representation_image: gr.update(),
73
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
74
+ inpaintWithText_state: inpaintWithText_dict}
75
+ else:
76
+ inpaintWithText_dict["origin_microphone_latent_representations"] = origin_latent_representations.tolist()
77
+ inpaintWithText_dict[
78
+ "sound2sound_origin_microphone_latent_representation_image"] = latent_representation_to_Gradio_image(
79
+ origin_latent_representations[0]).tolist()
80
+ inpaintWithText_dict[
81
+ "sound2sound_origin_microphone_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
82
+ quantized_origin_latent_representations[0]).tolist()
83
+ return {sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
84
+ sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
85
+ sound2sound_origin_spectrogram_microphone_image: gr.update(),
86
+ sound2sound_origin_phase_microphone_image: gr.update(),
87
+ sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
88
+ origin_latent_representations[0]),
89
+ sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
90
+ quantized_origin_latent_representations[0]),
91
+ sound2sound_origin_microphone_latent_representation_image: gr.update(),
92
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
93
+ inpaintWithText_state: inpaintWithText_dict}
94
+
95
+ def sound2sound_sample(sound2sound_origin_spectrogram_upload, sound2sound_origin_spectrogram_microphone,
96
+ text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
97
+ sound2sound_guidance_scale, sound2sound_sampler,
98
+ sound2sound_sample_steps, sound2sound_origin_source,
99
+ sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area, inpaintWithText_dict
100
+ ):
101
+
102
+ # input preprocessing
103
+ sound2sound_seed = safe_int(sound2sound_seed, 12345678)
104
+ sound2sound_batchsize = int(sound2sound_batchsize)
105
+ noising_strength = sound2sound_noising_strength
106
+ sound2sound_sample_steps = int(sound2sound_sample_steps)
107
+ CFG = int(sound2sound_guidance_scale)
108
+
109
+ text2sound_embedding = \
110
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(device)
111
+
112
+ if sound2sound_origin_source == "upload":
113
+ origin_latent_representations = torch.tensor(
114
+ inpaintWithText_dict["origin_upload_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
115
+ device)
116
+ elif sound2sound_origin_source == "microphone":
117
+ origin_latent_representations = torch.tensor(
118
+ inpaintWithText_dict["origin_microphone_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
119
+ device)
120
+ else:
121
+ print("Input source not in ['upload', 'microphone']!")
122
+ raise NotImplementedError()
123
+
124
+ high_resolution_latent_representations = torch.zeros((sound2sound_batchsize, channels, 256, 64)).to(device)
125
+ high_resolution_latent_representations[:, :, :128, :] = origin_latent_representations
126
+ latent_mask = np.ones((256, 64))
127
+ latent_mask[192:, :] = 0.0
128
+ print(f"latent_mask mean: {np.mean(latent_mask)}")
129
+
130
+ if sound2sound_inpaint_area == "inpaint masked":
131
+ latent_mask = 1 - latent_mask
132
+ latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(sound2sound_batchsize, channels, 1,
133
+ 1).float().to(device)
134
+ latent_mask = torch.flip(latent_mask, [2])
135
+
136
+ mySampler = DiffSynthSampler(timesteps, height=height*2, channels=channels, noise_strategy=noise_strategy)
137
+ unconditional_condition = \
138
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
139
+ mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))
140
+
141
+ normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
142
+
143
+ mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
144
+
145
+ # Todo: remove hard-coding
146
+ width = high_resolution_latent_representations.shape[-1]
147
+ condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
148
+
149
+ new_sound_latent_representations, initial_noise = \
150
+ mySampler.inpaint_sample(model=uNet, shape=(sound2sound_batchsize, channels, height*2, width),
151
+ seed=sound2sound_seed,
152
+ noising_strength=noising_strength,
153
+ guide_img=high_resolution_latent_representations, mask=latent_mask, return_tensor=True,
154
+ condition=condition, sampler=sound2sound_sampler)
155
+
156
+ new_sound_latent_representations = new_sound_latent_representations[-1]
157
+
158
+ # Quantize new sound latent representations
159
+ quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
160
+ new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
161
+ quantized_new_sound_latent_representations,
162
+ resolution=(
163
+ 1024,
164
+ width * VAE_scale),
165
+ original_STFT_batch=None
166
+ )
167
+
168
+ new_sound_latent_representation_gradio_images = []
169
+ new_sound_quantized_latent_representation_gradio_images = []
170
+ new_sound_spectrogram_gradio_images = []
171
+ new_sound_phase_gradio_images = []
172
+ new_sound_rec_signals_gradio = []
173
+ for i in range(sound2sound_batchsize):
174
+ new_sound_latent_representation_gradio_images.append(
175
+ latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
176
+ new_sound_quantized_latent_representation_gradio_images.append(
177
+ latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
178
+ new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
179
+ new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
180
+ new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
181
+
182
+ inpaintWithText_dict[
183
+ "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
184
+ inpaintWithText_dict[
185
+ "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
186
+ inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
187
+ inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
188
+ inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
189
+
190
+ return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
191
+ new_sound_latent_representations[0]),
192
+ sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
193
+ quantized_new_sound_latent_representations[0]),
194
+ sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
195
+ sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
196
+ sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
197
+ sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
198
+ step=1.0,
199
+ visible=True,
200
+ label="Sample index",
201
+ info="Swipe to view other samples"),
202
+ sound2sound_seed_textbox: sound2sound_seed,
203
+ inpaintWithText_state: inpaintWithText_dict}
204
+
205
+ def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
206
+ sample_index = int(sound2sound_sample_index)
207
+ return {sound2sound_new_sound_latent_representation_image:
208
+ inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
209
+ sound2sound_new_sound_quantized_latent_representation_image:
210
+ inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
211
+ sound2sound_new_sound_spectrogram_image: inpaintWithText_dict["new_sound_spectrogram_gradio_images"][
212
+ sample_index],
213
+ sound2sound_new_sound_phase_image: inpaintWithText_dict["new_sound_phase_gradio_images"][
214
+ sample_index],
215
+ sound2sound_new_sound_audio: inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index]}
216
+
217
+ def sound2sound_switch_origin_source(sound2sound_origin_source):
218
+
219
+ if sound2sound_origin_source == "upload":
220
+ return {sound2sound_origin_upload_audio: gr.update(visible=True),
221
+ sound2sound_origin_microphone_audio: gr.update(visible=False),
222
+ sound2sound_origin_spectrogram_upload_image: gr.update(visible=True),
223
+ sound2sound_origin_phase_upload_image: gr.update(visible=True),
224
+ sound2sound_origin_spectrogram_microphone_image: gr.update(visible=False),
225
+ sound2sound_origin_phase_microphone_image: gr.update(visible=False),
226
+ sound2sound_origin_upload_latent_representation_image: gr.update(visible=True),
227
+ sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=True),
228
+ sound2sound_origin_microphone_latent_representation_image: gr.update(visible=False),
229
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=False)}
230
+ elif sound2sound_origin_source == "microphone":
231
+ return {sound2sound_origin_upload_audio: gr.update(visible=False),
232
+ sound2sound_origin_microphone_audio: gr.update(visible=True),
233
+ sound2sound_origin_spectrogram_upload_image: gr.update(visible=False),
234
+ sound2sound_origin_phase_upload_image: gr.update(visible=False),
235
+ sound2sound_origin_spectrogram_microphone_image: gr.update(visible=True),
236
+ sound2sound_origin_phase_microphone_image: gr.update(visible=True),
237
+ sound2sound_origin_upload_latent_representation_image: gr.update(visible=False),
238
+ sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=False),
239
+ sound2sound_origin_microphone_latent_representation_image: gr.update(visible=True),
240
+ sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=True)}
241
+ else:
242
+ print("Input source not in ['upload', 'microphone']!")
243
+
244
+ with gr.Tab("Super Resolution"):
245
+ gr.Markdown("Select the area to inpaint and use the prompt to guide the synthesis of a new sound!")
246
+ with gr.Row(variant="panel"):
247
+ with gr.Column(scale=3):
248
+ text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
249
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
250
+
251
+ with gr.Column(scale=1):
252
+ sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
253
+
254
+ sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
255
+ label="Sample index",
256
+ info="Swipe to view other samples")
257
+
258
+ with gr.Row(variant="panel"):
259
+ with gr.Column(scale=1):
260
+ with gr.Tab("Origin sound"):
261
+ sound2sound_duration_slider = gradioWebUI.get_duration_slider()
262
+ sound2sound_origin_source_radio = gr.Radio(choices=["upload", "microphone"], value="upload",
263
+ label="Input source")
264
+
265
+ sound2sound_origin_upload_audio = gr.Audio(type="numpy", label="Upload", source="upload",
266
+ interactive=True, visible=True)
267
+ sound2sound_origin_microphone_audio = gr.Audio(type="numpy", label="Record", source="microphone",
268
+ interactive=True, visible=False)
269
+ with gr.Row(variant="panel"):
270
+ sound2sound_origin_spectrogram_upload_image = gr.Image(label="Original upload spectrogram",
271
+ type="numpy", height=600,
272
+ visible=True, tool="sketch")
273
+ sound2sound_origin_phase_upload_image = gr.Image(label="Original upload phase",
274
+ type="numpy", height=600,
275
+ visible=True)
276
+ sound2sound_origin_spectrogram_microphone_image = gr.Image(label="Original microphone spectrogram",
277
+ type="numpy", height=600,
278
+ visible=False, tool="sketch")
279
+ sound2sound_origin_phase_microphone_image = gr.Image(label="Original microphone phase",
280
+ type="numpy", height=600,
281
+ visible=False)
282
+ sound2sound_inpaint_area_radio = gr.Radio(choices=["inpaint masked", "inpaint not masked"],
283
+ value="inpaint masked")
284
+
285
+ with gr.Tab("Sound2sound settings"):
286
+ sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
287
+ sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
288
+ sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
289
+ sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.0)
290
+ sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
291
+ sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()
292
+
293
+
294
+ with gr.Column(scale=1):
295
+ sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
296
+ with gr.Row(variant="panel"):
297
+ sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
298
+ height=1200, scale=1)
299
+ sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
300
+ height=1200, scale=1)
301
+
302
+ with gr.Row(variant="panel"):
303
+ sound2sound_origin_upload_latent_representation_image = gr.Image(label="Original latent representation",
304
+ type="numpy", height=1200,
305
+ visible=True)
306
+ sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
307
+ label="Original quantized latent representation", type="numpy", height=1200, visible=True)
308
+
309
+ sound2sound_origin_microphone_latent_representation_image = gr.Image(label="Original latent representation",
310
+ type="numpy", height=1200,
311
+ visible=False)
312
+ sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
313
+ label="Original quantized latent representation", type="numpy", height=1200, visible=False)
314
+
315
+ sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
316
+ type="numpy", height=1200)
317
+ sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
318
+ label="New sound quantized latent representation", type="numpy", height=1200)
319
+
320
+ sound2sound_origin_upload_audio.change(receive_upload_origin_audio,
321
+ inputs=[sound2sound_duration_slider, sound2sound_origin_source_radio, sound2sound_origin_upload_audio,
322
+ sound2sound_origin_microphone_audio, inpaintWithText_state],
323
+ outputs=[sound2sound_origin_spectrogram_upload_image,
324
+ sound2sound_origin_phase_upload_image,
325
+ sound2sound_origin_spectrogram_microphone_image,
326
+ sound2sound_origin_phase_microphone_image,
327
+ sound2sound_origin_upload_latent_representation_image,
328
+ sound2sound_origin_upload_quantized_latent_representation_image,
329
+ sound2sound_origin_microphone_latent_representation_image,
330
+ sound2sound_origin_microphone_quantized_latent_representation_image,
331
+ inpaintWithText_state])
332
+ sound2sound_origin_microphone_audio.change(receive_upload_origin_audio,
333
+ inputs=[sound2sound_duration_slider, sound2sound_origin_source_radio, sound2sound_origin_upload_audio,
334
+ sound2sound_origin_microphone_audio, inpaintWithText_state],
335
+ outputs=[sound2sound_origin_spectrogram_upload_image,
336
+ sound2sound_origin_phase_upload_image,
337
+ sound2sound_origin_spectrogram_microphone_image,
338
+ sound2sound_origin_phase_microphone_image,
339
+ sound2sound_origin_upload_latent_representation_image,
340
+ sound2sound_origin_upload_quantized_latent_representation_image,
341
+ sound2sound_origin_microphone_latent_representation_image,
342
+ sound2sound_origin_microphone_quantized_latent_representation_image,
343
+ inpaintWithText_state])
344
+
345
+ sound2sound_sample_button.click(sound2sound_sample,
346
+ inputs=[sound2sound_origin_spectrogram_upload_image,
347
+ sound2sound_origin_spectrogram_microphone_image,
348
+ text2sound_prompts_textbox,
349
+ text2sound_negative_prompts_textbox,
350
+ sound2sound_batchsize_slider,
351
+ sound2sound_guidance_scale_slider,
352
+ sound2sound_sampler_radio,
353
+ sound2sound_sample_steps_slider,
354
+ sound2sound_origin_source_radio,
355
+ sound2sound_noising_strength_slider,
356
+ sound2sound_seed_textbox,
357
+ sound2sound_inpaint_area_radio,
358
+ inpaintWithText_state],
359
+ outputs=[sound2sound_new_sound_latent_representation_image,
360
+ sound2sound_new_sound_quantized_latent_representation_image,
361
+ sound2sound_new_sound_spectrogram_image,
362
+ sound2sound_new_sound_phase_image,
363
+ sound2sound_new_sound_audio,
364
+ sound2sound_sample_index_slider,
365
+ sound2sound_seed_textbox,
366
+ inpaintWithText_state])
367
+
368
+ sound2sound_sample_index_slider.change(show_sound2sound_sample,
369
+ inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
370
+ outputs=[sound2sound_new_sound_latent_representation_image,
371
+ sound2sound_new_sound_quantized_latent_representation_image,
372
+ sound2sound_new_sound_spectrogram_image,
373
+ sound2sound_new_sound_phase_image,
374
+ sound2sound_new_sound_audio])
375
+
376
+ sound2sound_origin_source_radio.change(sound2sound_switch_origin_source,
377
+ inputs=[sound2sound_origin_source_radio],
378
+ outputs=[sound2sound_origin_upload_audio,
379
+ sound2sound_origin_microphone_audio,
380
+ sound2sound_origin_spectrogram_upload_image,
381
+ sound2sound_origin_phase_upload_image,
382
+ sound2sound_origin_spectrogram_microphone_image,
383
+ sound2sound_origin_phase_microphone_image,
384
+ sound2sound_origin_upload_latent_representation_image,
385
+ sound2sound_origin_upload_quantized_latent_representation_image,
386
+ sound2sound_origin_microphone_latent_representation_image,
387
+ sound2sound_origin_microphone_quantized_latent_representation_image])
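To make the masking step in sound2sound_sample easier to follow, here is a small numpy-only sketch of how latent_mask is built before inpaint_sample is called. Which value ends up meaning "keep the guide latent" versus "synthesize here" is decided inside DiffSynthSampler.inpaint_sample, which this diff does not show, so the comments below only describe the array manipulation itself:

import numpy as np

# Per-sample latent plane used above: 256 rows x 64 columns.
latent_mask = np.ones((256, 64))
latent_mask[192:, :] = 0.0                    # zero out the top quarter of the rows
print(f"latent_mask mean: {np.mean(latent_mask)}")   # 0.75

inpaint_area = "inpaint masked"               # the radio's default value
if inpaint_area == "inpaint masked":
    latent_mask = 1 - latent_mask             # swap which region carries ones and which carries zeros
latent_mask = np.flip(latent_mask, axis=0)    # numpy counterpart of the torch.flip(latent_mask, [2]) step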
webUI/natural_language_guided_4/text2sound.py ADDED
@@ -0,0 +1,220 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+
4
+ from model.DiffSynthSampler import DiffSynthSampler
5
+ from tools import safe_int
6
+ from webUI.natural_language_guided_4.utils import latent_representation_to_Gradio_image, \
7
+ encodeBatch2GradioOutput_STFT, add_instrument, resize_image_to_aspect_ratio
8
+
9
+
10
+ def get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state):
11
+ # Load configurations
12
+ uNet = gradioWebUI.uNet
13
+ freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
14
+ VAE_scale = gradioWebUI.VAE_scale
15
+ height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
16
+
17
+ timesteps = gradioWebUI.timesteps
18
+ VAE_quantizer = gradioWebUI.VAE_quantizer
19
+ VAE_decoder = gradioWebUI.VAE_decoder
20
+ CLAP = gradioWebUI.CLAP
21
+ CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
22
+ device = gradioWebUI.device
23
+ squared = gradioWebUI.squared
24
+ sample_rate = gradioWebUI.sample_rate
25
+ noise_strategy = gradioWebUI.noise_strategy
26
+
27
+ def diffusion_random_sample(text2sound_prompts, text2sound_negative_prompts, text2sound_batchsize,
28
+ text2sound_duration,
29
+ text2sound_guidance_scale, text2sound_sampler,
30
+ text2sound_sample_steps, text2sound_seed,
31
+ text2sound_dict):
32
+ text2sound_sample_steps = int(text2sound_sample_steps)
33
+ text2sound_seed = safe_int(text2sound_seed, 12345678)
34
+
35
+ width = int(time_resolution * ((text2sound_duration + 1) / 4) / VAE_scale)
36
+
37
+ text2sound_batchsize = int(text2sound_batchsize)
38
+
39
+ text2sound_embedding = \
40
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
41
+ device)
42
+
43
+ CFG = int(text2sound_guidance_scale)
44
+
45
+ mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
46
+ negative_condition = \
47
+ CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
48
+ 0]
49
+
50
+ mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))
51
+
52
+ mySampler.respace(list(np.linspace(0, timesteps - 1, text2sound_sample_steps, dtype=np.int32)))
53
+
54
+ condition = text2sound_embedding.repeat(text2sound_batchsize, 1)
55
+
56
+ latent_representations, initial_noise = \
57
+ mySampler.sample(model=uNet, shape=(text2sound_batchsize, channels, height, width), seed=text2sound_seed,
58
+ return_tensor=True, condition=condition, sampler=text2sound_sampler)
59
+
60
+ latent_representations = latent_representations[-1]
61
+
62
+ latent_representation_gradio_images = []
63
+ quantized_latent_representation_gradio_images = []
64
+ new_sound_spectrogram_gradio_images = []
65
+ new_sound_phase_gradio_images = []
66
+ new_sound_rec_signals_gradio = []
67
+
68
+ quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
69
+ # Todo: remove hard-coding
70
+ flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
71
+ quantized_latent_representations,
72
+ resolution=(
73
+ 512,
74
+ width * VAE_scale),
75
+ original_STFT_batch=None
76
+ )
77
+
78
+ for i in range(text2sound_batchsize):
79
+ latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
80
+ quantized_latent_representation_gradio_images.append(
81
+ latent_representation_to_Gradio_image(quantized_latent_representations[i]))
82
+ new_sound_spectrogram_gradio_images.append(flipped_log_spectrums[i])
83
+ new_sound_phase_gradio_images.append(flipped_phases[i])
84
+ new_sound_rec_signals_gradio.append((sample_rate, rec_signals[i]))
85
+
86
+ text2sound_dict["latent_representation_gradio_images"] = latent_representation_gradio_images
87
+ text2sound_dict["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
88
+ text2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
89
+ text2sound_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
90
+ text2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
91
+
92
+ # save instrument
93
+ text2sound_dict["latent_representations"] = latent_representations.to("cpu").detach().numpy()
94
+ text2sound_dict["quantized_latent_representations"] = quantized_latent_representations.to(
95
+ "cpu").detach().numpy()
96
+ text2sound_dict["condition"] = condition.to("cpu").detach().numpy()
97
+ text2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
98
+ text2sound_dict["guidance_scale"] = CFG
99
+ text2sound_dict["sampler"] = text2sound_sampler
100
+
101
+ return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][0],
102
+ text2sound_quantized_latent_representation_image:
103
+ text2sound_dict["quantized_latent_representation_gradio_images"][0],
104
+ text2sound_sampled_spectrogram_image: resize_image_to_aspect_ratio(
105
+ text2sound_dict["new_sound_spectrogram_gradio_images"][0],
106
+ 1.55,
107
+ 1),
108
+ text2sound_sampled_phase_image: resize_image_to_aspect_ratio(
109
+ text2sound_dict["new_sound_phase_gradio_images"][0],
110
+ 1.55,
111
+ 1),
112
+ text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][0],
113
+ text2sound_seed_textbox: text2sound_seed,
114
+ text2sound_state: text2sound_dict,
115
+ text2sound_sample_index_slider: gr.update(minimum=0, maximum=text2sound_batchsize - 1, value=0, step=1,
116
+ visible=True,
117
+ label="Sample index.",
118
+ info="Swipe to view other samples")}
119
+
120
+ def show_random_sample(sample_index, text2sound_dict):
121
+ sample_index = int(sample_index)
122
+ text2sound_dict["sample_index"] = sample_index
123
+ print(text2sound_dict["new_sound_rec_signals_gradio"][sample_index])
124
+ return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][
125
+ sample_index],
126
+ text2sound_quantized_latent_representation_image:
127
+ text2sound_dict["quantized_latent_representation_gradio_images"][sample_index],
128
+ text2sound_sampled_spectrogram_image: resize_image_to_aspect_ratio(
129
+ text2sound_dict["new_sound_spectrogram_gradio_images"][sample_index], 1.55, 1),
130
+ text2sound_sampled_phase_image: resize_image_to_aspect_ratio(text2sound_dict["new_sound_phase_gradio_images"][
131
+ sample_index], 1.55, 1),
132
+ text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][sample_index]}
133
+
134
+ def save_virtual_instrument(sample_index, virtual_instrument_name, text2sound_dict, virtual_instruments_dict):
135
+ virtual_instruments_dict = add_instrument(text2sound_dict, virtual_instruments_dict, virtual_instrument_name,
136
+ sample_index)
137
+
138
+ return {virtual_instruments_state: virtual_instruments_dict,
139
+ text2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
140
+ placeholder=f"Saved as {virtual_instrument_name}!")}
141
+
142
+ with gr.Tab("Text2sound"):
143
+ gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
144
+ with gr.Row(variant="panel"):
145
+ with gr.Column(scale=3):
146
+ text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
147
+ text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
148
+
149
+ with gr.Column(scale=1):
150
+ text2sound_sampling_button = gr.Button(variant="primary",
151
+ value="Generate a batch of samples and show "
152
+ "the first one",
153
+ scale=1)
154
+ text2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
155
+ label="Sample index",
156
+ info="Swipe to view other samples")
157
+ with gr.Row(variant="panel"):
158
+ with gr.Column(variant="panel", scale=1):
159
+ text2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
160
+ text2sound_sampler_radio = gradioWebUI.get_sampler_radio()
161
+ text2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
162
+ text2sound_duration_slider = gradioWebUI.get_duration_slider()
163
+ text2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
164
+ text2sound_seed_textbox = gradioWebUI.get_seed_textbox()
165
+
166
+ with gr.Column(variant="panel", scale=1):
167
+ with gr.Row(variant="panel", ):
168
+ text2sound_sampled_spectrogram_image = gr.Image(label="Sampled spectrogram", type="numpy", )
169
+ text2sound_sampled_phase_image = gr.Image(label="Sampled phase", type="numpy")
170
+ text2sound_sampled_audio = gr.Audio(type="numpy", label="Play",
171
+ scale=1)
172
+
173
+ with gr.Row(variant="panel", ):
174
+ text2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=2,
175
+ placeholder="Name of your instrument",
176
+ scale=1)
177
+ text2sound_save_instrument_button = gr.Button(variant="primary",
178
+ value="Save instrument",
179
+ scale=1)
180
+
181
+ with gr.Row(variant="panel"):
182
+ text2sound_latent_representation_image = gr.Image(label="Sampled latent representation", type="numpy",
183
+ height=200, width=100, visible=False)
184
+ text2sound_quantized_latent_representation_image = gr.Image(label="Quantized latent representation",
185
+ type="numpy", height=200, width=100,
186
+ visible=False)
187
+
188
+ text2sound_sampling_button.click(diffusion_random_sample,
189
+ inputs=[text2sound_prompts_textbox,
190
+ text2sound_negative_prompts_textbox,
191
+ text2sound_batchsize_slider,
192
+ text2sound_duration_slider,
193
+ text2sound_guidance_scale_slider, text2sound_sampler_radio,
194
+ text2sound_sample_steps_slider,
195
+ text2sound_seed_textbox,
196
+ text2sound_state],
197
+ outputs=[text2sound_latent_representation_image,
198
+ text2sound_quantized_latent_representation_image,
199
+ text2sound_sampled_spectrogram_image,
200
+ text2sound_sampled_phase_image,
201
+ text2sound_sampled_audio,
202
+ text2sound_seed_textbox,
203
+ text2sound_state,
204
+ text2sound_sample_index_slider])
205
+
206
+ text2sound_save_instrument_button.click(save_virtual_instrument,
207
+ inputs=[text2sound_sample_index_slider,
208
+ text2sound_instrument_name_textbox,
209
+ text2sound_state,
210
+ virtual_instruments_state],
211
+ outputs=[virtual_instruments_state,
212
+ text2sound_instrument_name_textbox])
213
+
214
+ text2sound_sample_index_slider.change(show_random_sample,
215
+ inputs=[text2sound_sample_index_slider, text2sound_state],
216
+ outputs=[text2sound_latent_representation_image,
217
+ text2sound_quantized_latent_representation_image,
218
+ text2sound_sampled_spectrogram_image,
219
+ text2sound_sampled_phase_image,
220
+ text2sound_sampled_audio])
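The evenly spaced denoising schedule handed to mySampler.respace is easier to picture with concrete numbers; the values below assume timesteps = 1000 and 10 sampling steps purely for illustration:

import numpy as np

timesteps, sample_steps = 1000, 10   # illustrative values only
schedule = list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32))
print(schedule)   # [0, 111, 222, 333, 444, 555, 666, 777, 888, 999]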
webUI/natural_language_guided_4/track_maker.py ADDED
@@ -0,0 +1,248 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+
5
+ from model.DiffSynthSampler import DiffSynthSampler
6
+ from webUI.natural_language_guided_4.utils import encodeBatch2GradioOutput_STFT
7
+ import mido
8
+ import torchaudio.transforms as transforms
9
+ from tqdm import tqdm
10
+
11
+
12
+ # def pitch_shift_audio(waveform, sample_rate, n_steps, device='cpu', n_fft=1024, hop_length=None):
13
+ # # If the input is a numpy array, convert it to a torch.Tensor
14
+ # if isinstance(waveform, np.ndarray):
15
+ # waveform = torch.from_numpy(waveform)
16
+ #
17
+ # # Default hop_length to a fraction of n_fft (n_fft // 4 below) to reduce the memory cost of the STFT
18
+ # if hop_length is None:
19
+ # hop_length = n_fft // 4
20
+ #
21
+ # # Move the waveform to the target device
22
+ # waveform = waveform.to(device, dtype=torch.float32)
23
+ #
24
+ # # Build the PitchShift transform and move it to the same device
25
+ # pitch_shift = transforms.PitchShift(
26
+ # sample_rate=sample_rate,
27
+ # n_steps=n_steps,
28
+ # n_fft=n_fft,
29
+ # hop_length=hop_length
30
+ # ).to(device)
31
+ #
32
+ # # Apply the transform, move the result back to the CPU, and convert it to a numpy array
33
+ # shifted_waveform = pitch_shift(waveform).detach().cpu().numpy()
34
+ #
35
+ # return shifted_waveform
36
+
37
+
38
+ def pitch_shift_librosa(waveform, sample_rate, total_steps, step_size=4, n_fft=4096, hop_length=None):
+     # librosa expects a numpy array as input
+     if isinstance(waveform, torch.Tensor):
+         waveform = waveform.numpy()
+
+     # If hop_length is not provided, default to a quarter of n_fft
+     if hop_length is None:
+         hop_length = n_fft // 4
+
+     # Shift incrementally, at most step_size semitones per pass, walking toward total_steps
+     # with signed steps so that downward shifts (negative total_steps) are handled as well.
+     current_waveform = waveform
+     remaining = float(total_steps)
+
+     while abs(remaining) > 1e-6:
+         step = float(np.sign(remaining)) * min(step_size, abs(remaining))  # never overshoot total_steps
+         current_waveform = librosa.effects.pitch_shift(
+             current_waveform, sr=sample_rate, n_steps=step,
+             n_fft=n_fft, hop_length=hop_length
+         )
+         remaining -= step
+
+     return current_waveform
59
+
60
+
61
+
62
+
63
+ class NoteEvent:
64
+ def __init__(self, note, velocity, start_time, duration):
65
+ self.note = note
66
+ self.velocity = velocity
67
+ self.start_time = start_time # In ticks
68
+ self.duration = duration # In ticks
69
+
70
+ def __str__(self):
71
+ return f"Note {self.note}, velocity {self.velocity}, start_time {self.start_time}, duration {self.duration}"
72
+
73
+
74
+ class Track:
75
+ def __init__(self, track, ticks_per_beat, max_notes=100):
76
+ self.tempo_events = self._parse_tempo_events(track)
77
+ self.events = self._parse_note_events(track)
78
+ self.ticks_per_beat = ticks_per_beat
79
+ self.max_notes = int(max_notes)
80
+
81
+ def _parse_tempo_events(self, track):
82
+ tempo_events = []
83
+ current_tempo = 500000 # Default MIDI tempo is 120 BPM which is 500000 microseconds per beat
84
+ for msg in track:
85
+ if msg.type == 'set_tempo':
86
+ tempo_events.append((msg.time, msg.tempo))
87
+ elif not msg.is_meta:
88
+ tempo_events.append((msg.time, current_tempo))
89
+ return tempo_events
90
+
91
+ def _parse_note_events(self, track):
92
+ events = []
93
+ start_time = 0
94
+ for msg in track:
95
+ if not msg.is_meta:
96
+ start_time += msg.time
97
+ if msg.type == 'note_on' and msg.velocity > 0:
98
+ note_on_time = start_time
99
+ elif msg.type == 'note_on' and msg.velocity == 0:
100
+ duration = start_time - note_on_time
101
+ events.append(NoteEvent(msg.note, msg.velocity, note_on_time, duration))
102
+ return events
103
+
104
+ def synthesize_track(self, diffSynthSampler, sample_rate=16000):
105
+ track_audio = np.zeros(int(self._get_total_time() * sample_rate), dtype=np.float32)
106
+ current_tempo = 500000 # Start with default MIDI tempo 120 BPM
107
+ duration_note_mapping = {}
108
+
109
+ for event in tqdm(self.events[:self.max_notes]):
110
+ current_tempo = self._get_tempo_at(event.start_time)
111
+ seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
112
+ start_time_sec = event.start_time * seconds_per_tick
113
+ # Todo: set a minimum duration
114
+ duration_sec = event.duration * seconds_per_tick
115
+ duration_sec = max(duration_sec, 0.75)
116
+ start_sample = int(start_time_sec * sample_rate)
117
+ if not (str(duration_sec) in duration_note_mapping):
118
+ note_sample = diffSynthSampler(event.velocity, duration_sec)
119
+ duration_note_mapping[str(duration_sec)] = note_sample / np.max(np.abs(note_sample))
120
+
121
+ # note_audio = pyrb.pitch_shift(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
122
+ # note_audio = pitch_shift_audio(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
123
+ note_audio = pitch_shift_librosa(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
124
+ end_sample = start_sample + len(note_audio)
125
+ track_audio[start_sample:end_sample] += note_audio
126
+
127
+ return track_audio
128
+
129
+ def _get_tempo_at(self, time_tick):
130
+ current_tempo = 500000 # Start with default MIDI tempo 120 BPM
131
+ elapsed_ticks = 0
132
+
133
+ for tempo_change in self.tempo_events:
134
+ if elapsed_ticks + tempo_change[0] > time_tick:
135
+ return current_tempo
136
+ elapsed_ticks += tempo_change[0]
137
+ current_tempo = tempo_change[1]
138
+
139
+ return current_tempo
140
+
141
+ def _get_total_time(self):
142
+ total_time = 0
143
+ current_tempo = 500000 # Start with default MIDI tempo 120 BPM
144
+
145
+ for event in self.events:
146
+ current_tempo = self._get_tempo_at(event.start_time)
147
+ seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
148
+ total_time += event.duration * seconds_per_tick
149
+
150
+ return total_time + 10
151
+
152
+
153
+ class DiffSynth:
154
+ def __init__(self, instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder, text_encoder, CLAP_tokenizer, device,
155
+ model_sample_rate=16000, timesteps=1000, channels=4, freq_resolution=512, time_resolution=256, VAE_scale=4, squared=False):
156
+
157
+ self.noise_prediction_model = noise_prediction_model
158
+ self.VAE_quantizer = VAE_quantizer
159
+ self.VAE_decoder = VAE_decoder
160
+ self.device = device
161
+ self.model_sample_rate = model_sample_rate
162
+ self.timesteps = timesteps
163
+ self.channels = channels
164
+ self.freq_resolution = freq_resolution
165
+ self.time_resolution = time_resolution
166
+ self.height = int(freq_resolution/VAE_scale)
167
+ self.VAE_scale = VAE_scale
168
+ self.squared = squared
169
+ self.text_encoder = text_encoder
170
+ self.CLAP_tokenizer = CLAP_tokenizer
171
+
172
+ # instruments_configs 是字典 string -> (condition, negative_condition, guidance_scale, sample_steps, seed, initial_noise, sampler)
173
+ self.instruments_configs = instruments_configs
174
+ self.diffSynthSamplers = {}
175
+ self._update_instruments()
176
+
177
+
178
+ def _update_instruments(self):
179
+
180
+ def diffSynthSamplerWrapper(instruments_config):
181
+
182
+ def diffSynthSampler(velocity, duration_sec, sample_rate=16000):
183
+
184
+ condition = self.text_encoder.get_text_features(**self.CLAP_tokenizer([""], padding=True, return_tensors="pt")).to(self.device)
185
+ sample_steps = instruments_config['sample_steps']
186
+ sampler = instruments_config['sampler']
187
+ noising_strength = instruments_config['noising_strength']
188
+ latent_representation = instruments_config['latent_representation']
189
+ attack = instruments_config['attack']
190
+ before_release = instruments_config['before_release']
191
+
192
+ assert sample_rate == self.model_sample_rate, "sample_rate != model_sample_rate"
193
+
194
+ width = int(self.time_resolution * ((duration_sec + 1) / 4) / self.VAE_scale)
195
+
196
+ mySampler = DiffSynthSampler(self.timesteps, height=128, channels=4, noise_strategy="repeat", mute=True)
197
+ mySampler.respace(list(np.linspace(0, self.timesteps - 1, sample_steps, dtype=np.int32)))
198
+
199
+ # mask = 1, freeze
200
+ latent_mask = torch.zeros((1, 1, self.height, width), dtype=torch.float32).to(self.device)
201
+ latent_mask[:, :, :, :int(self.time_resolution * (attack / 4) / self.VAE_scale)] = 1.0
202
+ latent_mask[:, :, :, -int(self.time_resolution * ((before_release+1) / 4) / self.VAE_scale):] = 1.0
203
+
204
+ latent_representations, _ = \
205
+ mySampler.inpaint_sample(model=self.noise_prediction_model, shape=(1, self.channels, self.height, width),
206
+ noising_strength=noising_strength, condition=condition,
207
+ guide_img=latent_representation, mask=latent_mask, return_tensor=True,
208
+ sampler=sampler,
209
+ use_dynamic_mask=True, end_noise_level_ratio=0.0,
210
+ mask_flexivity=1.0)
211
+
212
+
213
+ latent_representations = latent_representations[-1]
214
+
215
+ quantized_latent_representations, _, (_, _, _) = self.VAE_quantizer(latent_representations)
216
+ # Todo: remove hard-coding
217
+
218
+ flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(self.VAE_decoder,
219
+ quantized_latent_representations,
220
+ resolution=(
221
+ 512,
222
+ width * self.VAE_scale),
223
+ original_STFT_batch=None,
224
+ )
225
+
226
+
227
+ return rec_signals[0]
228
+
229
+ return diffSynthSampler
230
+
231
+ for key in self.instruments_configs.keys():
232
+ self.diffSynthSamplers[key] = diffSynthSamplerWrapper(self.instruments_configs[key])
233
+
234
+ def get_music(self, mid, instrument_names, sample_rate=16000, max_notes=100):
235
+ tracks = [Track(t, mid.ticks_per_beat, max_notes) for t in mid.tracks]
236
+ assert len(tracks) <= len(instrument_names), f"len(tracks) = {len(tracks)} > {len(instrument_names)} = len(instrument_names)"
237
+
238
+ track_audios = [track.synthesize_track(self.diffSynthSamplers[instrument_names[i]], sample_rate=sample_rate) for i, track in enumerate(tracks)]
239
+
240
+ # 将所有音轨填充至最长音轨的长度,以便它们可以被叠加
241
+ max_length = max(len(audio) for audio in track_audios)
242
+ full_audio = np.zeros(max_length, dtype=np.float32) # 初始化全音频数组为零
243
+ for audio in track_audios:
244
+ # 音轨可能不够长,需要填充零
245
+ padded_audio = np.pad(audio, (0, max_length - len(audio)), 'constant')
246
+ full_audio += padded_audio # 叠加音轨
247
+
248
+ return full_audio
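A minimal usage sketch for DiffSynth follows. It is illustrative only: it assumes the noise-prediction model, VAE quantizer/decoder, CLAP text encoder/tokenizer and an instruments_configs dict have already been loaded by the web UI, that the chosen instrument names are keys of that dict, and that soundfile is available for writing the result.

import mido
import numpy as np
import soundfile as sf

synth = DiffSynth(instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder,
                  text_encoder, CLAP_tokenizer, device)

# One instrument name per MIDI track; len(instrument_names) must be >= the number of tracks.
mid = mido.MidiFile("webUI/presets/midis/Ode_to_Joy_Easy_variation.mid")
audio = synth.get_music(mid, instrument_names=["organ", "string"], max_notes=100)

# get_music returns a float32 mix at 16 kHz; normalize before writing to avoid clipping.
sf.write("rendered.wav", audio / np.max(np.abs(audio)), 16000)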
webUI/natural_language_guided_4/utils.py ADDED
@@ -0,0 +1,228 @@
+ import librosa
+ import numpy as np
+ import torch
+ from PIL import Image
+ from tools import np_power_to_db, decode_stft, depad_STFT
+
+
+ def spectrogram_to_Gradio_image(spc):
+     ### input: spc [np.ndarray]
+     frequency_resolution, time_resolution = spc.shape[-2], spc.shape[-1]
+     spc = np.reshape(spc, (frequency_resolution, time_resolution))
+
+     # Todo:
+     magnitude_spectrum = np.abs(spc)
+     log_spectrum = np_power_to_db(magnitude_spectrum)
+     flipped_log_spectrum = np.flipud(log_spectrum)
+
+     colorful_spc = np.ones((frequency_resolution, time_resolution, 3)) * -80.0
+     colorful_spc[:, :, 0] = flipped_log_spectrum
+     colorful_spc[:, :, 1] = flipped_log_spectrum
+     colorful_spc[:, :, 2] = np.ones((frequency_resolution, time_resolution)) * -60.0
+     # Rescale to 0-255 and convert to uint8
+     rescaled = (colorful_spc + 80.0) / 80.0
+     rescaled = (255.0 * rescaled).astype(np.uint8)
+     return rescaled
+
+
+ def phase_to_Gradio_image(phase):
+     ### input: phase [np.ndarray]
+     frequency_resolution, time_resolution = phase.shape[-2], phase.shape[-1]
+     phase = np.reshape(phase, (frequency_resolution, time_resolution))
+
+     # Todo:
+     flipped_phase = np.flipud(phase)
+     flipped_phase = (flipped_phase + 1.0) / 2.0
+
+     colorful_spc = np.zeros((frequency_resolution, time_resolution, 3))
+     colorful_spc[:, :, 0] = flipped_phase
+     colorful_spc[:, :, 1] = flipped_phase
+     colorful_spc[:, :, 2] = 0.2
+     # Rescale to 0-255 and convert to uint8
+     rescaled = (255.0 * colorful_spc).astype(np.uint8)
+     return rescaled
+
+
+ def latent_representation_to_Gradio_image(latent_representation):
+     # input: latent_representation [torch.Tensor]
+     if not isinstance(latent_representation, np.ndarray):
+         latent_representation = latent_representation.to("cpu").detach().numpy()
+     image = latent_representation
+
+     def normalize_image(img):
+         min_val = img.min()
+         max_val = img.max()
+         normalized_img = ((img - min_val) / (max_val - min_val) * 255)
+         return normalized_img
+
+     image[0, :, :] = normalize_image(image[0, :, :])
+     image[1, :, :] = normalize_image(image[1, :, :])
+     image[2, :, :] = normalize_image(image[2, :, :])
+     image[3, :, :] = normalize_image(image[3, :, :])
+     image_transposed = np.transpose(image, (1, 2, 0))
+     enlarged_image = np.repeat(image_transposed, 8, axis=0)
+     enlarged_image = np.repeat(enlarged_image, 8, axis=1)
+     return np.flipud(enlarged_image).astype(np.uint8)
+
+
+ def InputBatch2Encode_STFT(encoder, STFT_batch, resolution=(512, 256), quantizer=None, squared=True):
+     """Transform a batch of STFT representations into signals, Gradio images and latent encodings."""
+     # Todo: remove resolution hard-coding
+     frequency_resolution, time_resolution = resolution
+
+     device = next(encoder.parameters()).device
+     if quantizer is not None:
+         latent_representation_batch = encoder(STFT_batch.to(device))
+         quantized_latent_representation_batch, loss, (_, _, _) = quantizer(latent_representation_batch)
+     else:
+         mu, logvar, latent_representation_batch = encoder(STFT_batch.to(device))
+         quantized_latent_representation_batch = None
+
+     STFT_batch = STFT_batch.to("cpu").detach().numpy()
+
+     origin_flipped_log_spectrums, origin_flipped_phases, origin_signals = [], [], []
+     for STFT in STFT_batch:
+
+         padded_D_rec = decode_stft(STFT)
+         D_rec = depad_STFT(padded_D_rec)
+         spc = np.abs(D_rec)
+         phase = np.angle(D_rec)
+
+         flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
+         flipped_phase = phase_to_Gradio_image(phase)
+
+         # get_audio
+         rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
+
+         origin_flipped_log_spectrums.append(flipped_log_spectrum)
+         origin_flipped_phases.append(flipped_phase)
+         origin_signals.append(rec_signal)
+
+     return origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, \
+         latent_representation_batch, quantized_latent_representation_batch
+
+
+ def encodeBatch2GradioOutput_STFT(decoder, latent_vector_batch, resolution=(512, 256), original_STFT_batch=None):
+     """Decode a batch of latent vectors into spectrogram images, phase images and audio signals."""
+     # Todo: remove resolution hard-coding
+     frequency_resolution, time_resolution = resolution
+
+     if isinstance(latent_vector_batch, np.ndarray):
+         latent_vector_batch = torch.from_numpy(latent_vector_batch).to(next(decoder.parameters()).device)
+
+     reconstruction_batch = decoder(latent_vector_batch).to("cpu").detach().numpy()
+
+     flipped_log_spectrums, flipped_phases, rec_signals = [], [], []
+     flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp = [], [], []
+
+     for index, STFT in enumerate(reconstruction_batch):
+         padded_D_rec = decode_stft(STFT)
+         D_rec = depad_STFT(padded_D_rec)
+         spc = np.abs(D_rec)
+         phase = np.angle(D_rec)
+
+         flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
+         flipped_phase = phase_to_Gradio_image(phase)
+
+         # get_audio
+         rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
+
+         flipped_log_spectrums.append(flipped_log_spectrum)
+         flipped_phases.append(flipped_phase)
+         rec_signals.append(rec_signal)
+
+         ##########################################
+
+         if original_STFT_batch is not None:
+             STFT[0, :, :] = original_STFT_batch[index, 0, :, :]
+
+             padded_D_rec = decode_stft(STFT)
+             D_rec = depad_STFT(padded_D_rec)
+             spc = np.abs(D_rec)
+             phase = np.angle(D_rec)
+
+             flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
+             flipped_phase = phase_to_Gradio_image(phase)
+
+             # get_audio
+             rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
+
+             flipped_log_spectrums_with_original_amp.append(flipped_log_spectrum)
+             flipped_phases_with_original_amp.append(flipped_phase)
+             rec_signals_with_original_amp.append(rec_signal)
+
+     return flipped_log_spectrums, flipped_phases, rec_signals, \
+         flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp
+
+
+ def add_instrument(source_dict, virtual_instruments_dict, virtual_instrument_name, sample_index):
+
+     virtual_instruments = virtual_instruments_dict["virtual_instruments"]
+     virtual_instrument = {
+         "latent_representation": source_dict["latent_representations"][sample_index],
+         "quantized_latent_representation": source_dict["quantized_latent_representations"][sample_index],
+         "sampler": source_dict["sampler"],
+         "signal": source_dict["new_sound_rec_signals_gradio"][sample_index],
+         "spectrogram_gradio_image": source_dict["new_sound_spectrogram_gradio_images"][sample_index],
+         "phase_gradio_image": source_dict["new_sound_phase_gradio_images"][sample_index]}
+     virtual_instruments[virtual_instrument_name] = virtual_instrument
+     virtual_instruments_dict["virtual_instruments"] = virtual_instruments
+     return virtual_instruments_dict
+
+
+ def resize_image_to_aspect_ratio(image_data, aspect_ratio_width, aspect_ratio_height):
+     """
+     Stretch an image to a given aspect ratio, keeping both input and output as NumPy arrays.
+
+     Args:
+         image_data (numpy array): input image data (height, width, 3)
+         aspect_ratio_width (int): target width ratio
+         aspect_ratio_height (int): target height ratio
+
+     Returns:
+         numpy array: the resized image data
+     """
+     # Current width and height of the image
+     original_height, original_width, channels = image_data.shape
+
+     # Current aspect ratio
+     current_aspect_ratio = original_width / original_height
+
+     # Target aspect ratio
+     target_aspect_ratio = aspect_ratio_width / aspect_ratio_height
+
+     # Decide whether to stretch the width or the height
+     if current_aspect_ratio > target_aspect_ratio:
+         # The image is relatively too wide, so stretch the height
+         new_width = original_width
+         new_height = int(new_width / target_aspect_ratio)
+     else:
+         # The image is relatively too tall (or already matches), so stretch the width
+         new_height = original_height
+         new_width = int(new_height * target_aspect_ratio)
+
+     # Convert the numpy array to a PIL image
+     image = Image.fromarray(image_data.astype('uint8'))
+
+     # Resize with PIL, using LANCZOS in place of the removed ANTIALIAS
+     resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+     # Convert the PIL image back to a numpy array
+     resized_image_data = np.array(resized_image)
+
+     return resized_image_data
+
+
+ def average_np_arrays(arr_list):
+     if not arr_list:
+         raise ValueError("Input list cannot be empty")
+
+     stacked_arrays = np.stack(arr_list, axis=0)
+
+     avg_array = np.mean(stacked_arrays, axis=0)
+
+     return avg_array
+ return avg_array
webUI/presets/instruments/ax.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/electronic_sound.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/keyboard.wav ADDED
Binary file (128 kB).

webUI/presets/instruments/organ.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/string.wav ADDED
Binary file (131 kB).

webUI/presets/instruments/synth_lead.wav ADDED
Binary file (131 kB).

webUI/presets/midis/Air_on_the_G_String.mid ADDED
Binary file (6 kB).

webUI/presets/midis/Arhbo.mid ADDED
Binary file (14.7 kB).

webUI/presets/midis/Canon_in_D.mid ADDED
Binary file (10.9 kB).

webUI/presets/midis/Ode_to_Joy_Easy_variation.mid ADDED
Binary file (920 Bytes).

webUI/presets/midis/Rrharil.mid ADDED
Binary file (16.2 kB).