Spaces: Running
WeixuanYuan committed
Commit bd6e54b • 1 Parent(s): bdd2a77
Upload 70 files
Browse files
- app.py +22 -11
- webUI/natural_language_guided/__pycache__/README.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/note2music.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc +0 -0
- webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc +0 -0
- webUI/natural_language_guided/build_instrument.py +2 -1
- webUI/natural_language_guided/note2music.py +174 -0
- webUI/natural_language_guided/text2sound.py +2 -2
- webUI/natural_language_guided/track_maker.py +246 -246
- webUI/natural_language_guided_4/GAN.py +164 -0
- webUI/natural_language_guided_4/README.py +53 -0
- webUI/natural_language_guided_4/__pycache__/gradio_webUI.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/inpaint_with_text.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/instruments.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/load_presets.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/note2music.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/sound2sound_with_text.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/text2sound.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/track_maker.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/__pycache__/utils.cpython-310.pyc +0 -0
- webUI/natural_language_guided_4/build_instrument.py +305 -0
- webUI/natural_language_guided_4/gradio_webUI.py +68 -0
- webUI/natural_language_guided_4/inpaint_with_text.py +371 -0
- webUI/natural_language_guided_4/instruments.py +60 -0
- webUI/natural_language_guided_4/load_presets.py +81 -0
- webUI/natural_language_guided_4/note2music.py +200 -0
- webUI/natural_language_guided_4/rec.py +190 -0
- webUI/natural_language_guided_4/sound2sound_with_text.py +325 -0
- webUI/natural_language_guided_4/super_resolution_with_text.py +387 -0
- webUI/natural_language_guided_4/text2sound.py +220 -0
- webUI/natural_language_guided_4/track_maker.py +248 -0
- webUI/natural_language_guided_4/utils.py +228 -0
- webUI/presets/instruments/ax.wav +0 -0
- webUI/presets/instruments/electronic_sound.wav +0 -0
- webUI/presets/instruments/keyboard.wav +0 -0
- webUI/presets/instruments/organ.wav +0 -0
- webUI/presets/instruments/string.wav +0 -0
- webUI/presets/instruments/synth_lead.wav +0 -0
- webUI/presets/midis/Air_on_the_G_String.mid +0 -0
- webUI/presets/midis/Arhbo.mid +0 -0
- webUI/presets/midis/Canon_in_D.mid +0 -0
- webUI/presets/midis/Ode_to_Joy_Easy_variation.mid +0 -0
- webUI/presets/midis/Rrharil.mid +0 -0
app.py
CHANGED
@@ -15,12 +15,17 @@ from model.multimodal_model import get_multi_modal_model
 import gradio as gr

-from …
-from webUI.…
-from webUI.…
-from webUI.…
-from webUI.…
+from tools import read_wav_to_numpy
+from webUI.natural_language_guided_4.gradio_webUI import GradioWebUI
+from webUI.natural_language_guided_4.instruments import get_instruments_module
+from webUI.natural_language_guided_4.load_presets import load_presets
+from webUI.natural_language_guided_4.text2sound import get_text2sound_module
+from webUI.natural_language_guided_4.sound2sound_with_text import get_sound2sound_with_text_module
+from webUI.natural_language_guided_4.inpaint_with_text import get_inpaint_with_text_module
+# from webUI.natural_language_guided_4.build_instrument import get_build_instrument_module
+from webUI.natural_language_guided_4.note2music import get_arrangement_module
+# from webUI.natural_language_guided_4.README import get_readme_module

@@ -62,28 +67,34 @@ else:
 gradioWebUI = GradioWebUI(device, VAE, uNet, text_encoder, CLAP_tokenizer, freq_resolution=512, time_resolution=256, channels=4, timesteps=1000, squared=False,
                           VAE_scale=4, flexible_duration=True, noise_strategy="repeat", GAN_generator=None)

+virtual_instruments, midis = load_presets(gradioWebUI)
+
 with gr.Blocks(theme=gr.themes.Soft(), mode="dark") as demo:
-    gr.Markdown('Thank you for using DiffuSynth v0.2! \n <span style="color:red">The [Arrangement] feature is still being improved!</span>', unsafe_allow_html=True)
+    gr.Markdown("Thank you for using DiffuSynth v0.2!")

     reconstruction_state = gr.State(value={})
     text2sound_state = gr.State(value={})
     sound2sound_state = gr.State(value={})
     inpaint_state = gr.State(value={})
     super_resolution_state = gr.State(value={})
-    virtual_instruments_state = gr.State(value={"virtual_instruments": …
+    virtual_instruments_state = gr.State(value={"virtual_instruments": virtual_instruments})
+    midi_files_state = gr.State(value={"midis": midis})

     get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state)
     get_sound2sound_with_text_module(gradioWebUI, sound2sound_state, virtual_instruments_state)
     get_inpaint_with_text_module(gradioWebUI, inpaint_state, virtual_instruments_state)
-    get_build_instrument_module(gradioWebUI, virtual_instruments_state)
+    # get_build_instrument_module(gradioWebUI, virtual_instruments_state)
+    get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state)
     # get_readme_module()
+    # get_instruments_module(gradioWebUI, virtual_instruments_state)

 demo.launch(debug=True, share=True)
+# demo.launch(debug=True, share=False)
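In short, the new app.py loads the presets once at startup and shares them with every tab through gr.State handles; each tab module reads the dictionaries inside its callbacks, which is why get_arrangement_module receives both virtual_instruments_state and midi_files_state. A minimal, self-contained sketch of that pattern (the preset dictionaries and the demo_module function below are illustrative stand-ins, not code from this repository):

import gradio as gr

# Assumption: stand-ins for load_presets(gradioWebUI); the real loader reads the
# .wav/.mid presets under webUI/presets into instrument and MIDI dictionaries.
virtual_instruments = {"organ": {"signal": None}, "string": {"signal": None}}
midis = {"Ode_to_Joy_Easy_variation": None}

def demo_module(instruments_state, midis_state):
    # Each tab only receives the shared gr.State handles and reads them in callbacks.
    def list_presets(instruments_dict, midis_dict):
        return (f"instruments: {list(instruments_dict['virtual_instruments'])}, "
                f"midis: {list(midis_dict['midis'])}")

    with gr.Tab("Presets"):
        out = gr.Textbox(label="Loaded presets")
        btn = gr.Button("Show presets")
        btn.click(list_presets, inputs=[instruments_state, midis_state], outputs=[out])

with gr.Blocks() as demo:
    virtual_instruments_state = gr.State(value={"virtual_instruments": virtual_instruments})
    midi_files_state = gr.State(value={"midis": midis})
    demo_module(virtual_instruments_state, midi_files_state)

# demo.launch()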
webUI/natural_language_guided/__pycache__/README.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/README.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/README.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/build_instrument.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/gradio_webUI.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/inpaint_with_text.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/note2music.cpython-310.pyc
ADDED
Binary file (6.59 kB).

webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/sound2sound_with_text.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/text2sound.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/track_maker.cpython-310.pyc differ

webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc and b/webUI/natural_language_guided/__pycache__/utils.cpython-310.pyc differ
webUI/natural_language_guided/build_instrument.py
CHANGED
@@ -4,13 +4,14 @@ import torch
 import gradio as gr
 import mido
 from io import BytesIO
+# import pyrubberband as pyrb
+import torchaudio.transforms as transforms

 from model.DiffSynthSampler import DiffSynthSampler
 from tools import adsr_envelope, adjust_audio_length
 from webUI.natural_language_guided.track_maker import DiffSynth
 from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
     spectrogram_to_Gradio_image
-import torchaudio.transforms as transforms

 def time_stretch_audio(waveform, sample_rate, stretch_factor):
webUI/natural_language_guided/note2music.py
ADDED
@@ -0,0 +1,174 @@
import librosa
import numpy as np
import torch
import gradio as gr
import mido
from io import BytesIO
# import pyrubberband as pyrb
import torchaudio.transforms as transforms

from model.DiffSynthSampler import DiffSynthSampler
from tools import adsr_envelope, adjust_audio_length
from webUI.natural_language_guided.track_maker import DiffSynth, Track
from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
    spectrogram_to_Gradio_image


def get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels

    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def read_midi(midi, midi_files_dict):
        print(midi)
        midi_name = midi_file.name
        mid = mido.MidiFile(file=BytesIO(midi))
        tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]

        midi_info_text = f"Name: {midi_name}"
        for track in tracks:
            midi_info_text += f"\n {len(track.events)}"

        return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
                                              placeholder=midi_info_text),
                midi_files_state: midi_files_dict}

    def refresh_instruments(virtual_instruments_dict):
        virtual_instruments_names = list(virtual_instruments_dict["virtual_instruments"].keys())
        print(f"virtual_instruments_names: {virtual_instruments_names}")

        return {select_instrument_dropdown: gr.Dropdown.update(choices=["New Option 1", "New Option 2", "New Option 3"])}

    def select_sound(virtual_instrument_name, virtual_instruments_dict):
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]

        return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
                source_sound_phase_image: virtual_instrument["phase_gradio_image"],
                source_sound_audio: virtual_instrument["signal"]}

    def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names,
                   virtual_instruments_dict):

        if noising_strength < 1:
            print(f"Warning: making track with noising_strength = {noising_strength} < 1")
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        sample_steps = int(inpaint_steps)

        instrument_names = instrument_names.split("@")
        instruments_configs = {}
        for virtual_instrument_name in instrument_names:
            virtual_instrument = virtual_instruments[virtual_instrument_name]

            latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(
                device)
            sampler = virtual_instrument["sampler"]

            batchsize = 1

            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)

            mid = mido.MidiFile(file=BytesIO(midi))
            instruments_configs[virtual_instrument_name] = {
                'sample_steps': sample_steps,
                'sampler': sampler,
                'noising_strength': noising_strength,
                'latent_representation': latent_representation,
                'attack': attack,
                'before_release': before_release}

        diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)

        full_audio = diffSynth.get_music(mid, instrument_names)

        return {track_audio: (sample_rate, full_audio)}

    with gr.Tab("Arrangement"):
        gr.Markdown("Make music with generated sounds!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                preset_button_1 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
                preset_button_2 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
                preset_button_3 = gr.Button(variant="primary", value="Ode_to_Joy", scale=1)
                midi_file = gr.File(label="Upload midi file", type="binary", scale=2)
            with gr.Column(scale=3):
                midi_info_textbox = gr.Textbox(label="Midi info", lines=10, placeholder="Please select/upload a midi on the left.")
                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
                                                      placeholder="Names of your instrument used to play the midi", scale=1)
            with gr.Column(scale=3):
                refresh_instrument_button = gr.Button(variant="primary", value="Refresh instruments", scale=1)
                # instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
                #                                      placeholder="Name of your instrument", scale=1)
                select_instrument_dropdown = gr.Dropdown(choices=["Option 1", "Option 2", "Option 3"], label="Choose an option")
                source_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
            with gr.Column(scale=3):
                make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
                track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
        with gr.Row(variant="panel"):
            with gr.Tab("Origin sound"):
                inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
                                                 label="inpaint_steps")
                noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
                end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
                                                         label="end_noise_level_ratio")
                attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
                before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
                                                  label="before_release in sec")
                release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
                mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
                                                  label="mask_flexivity")
            with gr.Tab("Length adjustment config"):
                use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
                test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
                test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
                test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
                duration_slider = gradioWebUI.get_duration_slider()
            with gr.Tab("Pitch shift config"):
                pitch_shift_radio = gr.Radio(choices=["librosa", "torchaudio", "rubberband"],
                                             value="librosa")

        with gr.Row(variant="panel"):
            with gr.Column(scale=2):
                with gr.Row(variant="panel"):
                    source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                              height=600, scale=1)
                    source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                        height=600, scale=1)

    # instrument_name_textbox.change(select_sound,
    #                                inputs=[instrument_name_textbox, virtual_instruments_state],
    #                                outputs=[source_sound_audio])

    refresh_instrument_button.click(refresh_instruments,
                                    inputs=[virtual_instruments_state],
                                    outputs=[select_instrument_dropdown])

    make_track_button.click(make_track,
                            inputs=[inpaint_steps_slider, midi_file,
                                    noising_strength_slider,
                                    attack_slider,
                                    before_release_slider,
                                    instrument_names_textbox,
                                    virtual_instruments_state],
                            outputs=[track_audio])

    midi_file.change(read_midi,
                     inputs=[midi_file,
                             midi_files_state],
                     outputs=[midi_info_textbox,
                              midi_files_state])
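The make_track callback above encodes one convention worth calling out: the instrument names arrive as a single "@"-separated string, and DiffSynth.get_music plays the i-th MIDI track with the i-th name. A small illustration of that convention (the instrument names here are placeholders):

# "organ@string" means: track 0 is played with "organ", track 1 with "string".
instrument_names = "organ@string".split("@")   # -> ["organ", "string"]

# Inside DiffSynth.get_music, each parsed MIDI track i is synthesized with
# self.diffSynthSamplers[instrument_names[i]], so the string must name at least
# as many instruments as the MIDI file has tracks.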
webUI/natural_language_guided/text2sound.py
CHANGED
@@ -46,8 +46,8 @@ def get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_sta
 mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
 negative_condition = \
-    CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[…
-
+    CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
+
 mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))

 mySampler.respace(list(np.linspace(0, timesteps - 1, text2sound_sample_steps, dtype=np.int32)))
webUI/natural_language_guided/track_maker.py
CHANGED
@@ -1,247 +1,247 @@
import librosa
import numpy as np
import torch

from model.DiffSynthSampler import DiffSynthSampler
from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT
import mido
import torchaudio.transforms as transforms
from tqdm import tqdm


# def pitch_shift_audio(waveform, sample_rate, n_steps, device='cpu', n_fft=1024, hop_length=None):
#     # If the input is a numpy array, convert it to a torch.Tensor
#     if isinstance(waveform, np.ndarray):
#         waveform = torch.from_numpy(waveform)
#
#     # Default hop_length to n_fft // 4 (a reasonable default) to reduce the memory cost of the STFT
#     if hop_length is None:
#         hop_length = n_fft // 4
#
#     # Move the waveform to the target device
#     waveform = waveform.to(device, dtype=torch.float32)
#
#     # Build the pitch_shift transform and move it to the target device
#     pitch_shift = transforms.PitchShift(
#         sample_rate=sample_rate,
#         n_steps=n_steps,
#         n_fft=n_fft,
#         hop_length=hop_length
#     ).to(device)
#
#     # Apply the transform, move the result back to the CPU, and convert it to a numpy array
#     shifted_waveform = pitch_shift(waveform).detach().cpu().numpy()
#
#     return shifted_waveform


def pitch_shift_librosa(waveform, sample_rate, total_steps, step_size=4, n_fft=4096, hop_length=None):
    # librosa expects a numpy array as input
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.numpy()

    # If hop_length is not provided, default to a quarter of n_fft
    if hop_length is None:
        hop_length = n_fft // 4

    # Shift the pitch incrementally, raising it by at most step_size semitones at a time
    current_waveform = waveform
    num_steps = int(np.ceil(total_steps / step_size))

    for i in range(num_steps):
        step = min(step_size, total_steps - i * step_size)  # make sure the final step does not overshoot total_steps
        current_waveform = librosa.effects.pitch_shift(
            current_waveform, sr=sample_rate, n_steps=step,
            n_fft=n_fft, hop_length=hop_length
        )

    return current_waveform


class NoteEvent:
    def __init__(self, note, velocity, start_time, duration):
        self.note = note
        self.velocity = velocity
        self.start_time = start_time  # In ticks
        self.duration = duration  # In ticks

    def __str__(self):
        return f"Note {self.note}, velocity {self.velocity}, start_time {self.start_time}, duration {self.duration}"


class Track:
    def __init__(self, track, ticks_per_beat):
        self.tempo_events = self._parse_tempo_events(track)
        self.events = self._parse_note_events(track)
        self.ticks_per_beat = ticks_per_beat

    def _parse_tempo_events(self, track):
        tempo_events = []
        current_tempo = 500000  # Default MIDI tempo is 120 BPM which is 500000 microseconds per beat
        for msg in track:
            if msg.type == 'set_tempo':
                tempo_events.append((msg.time, msg.tempo))
            elif not msg.is_meta:
                tempo_events.append((msg.time, current_tempo))
        return tempo_events

    def _parse_note_events(self, track):
        events = []
        start_time = 0
        for msg in track:
            if not msg.is_meta:
                start_time += msg.time
                if msg.type == 'note_on' and msg.velocity > 0:
                    note_on_time = start_time
                elif msg.type == 'note_on' and msg.velocity == 0:
                    duration = start_time - note_on_time
                    events.append(NoteEvent(msg.note, msg.velocity, note_on_time, duration))
        return events

    def synthesize_track(self, diffSynthSampler, sample_rate=16000):
        track_audio = np.zeros(int(self._get_total_time() * sample_rate), dtype=np.float32)
        current_tempo = 500000  # Start with default MIDI tempo 120 BPM
        duration_note_mapping = {}

        for event in tqdm(self.events[:25]):
            current_tempo = self._get_tempo_at(event.start_time)
            seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
            start_time_sec = event.start_time * seconds_per_tick
            # Todo: set a minimum duration
            duration_sec = event.duration * seconds_per_tick
            duration_sec = max(duration_sec, 0.75)
            start_sample = int(start_time_sec * sample_rate)
            if not (str(duration_sec) in duration_note_mapping):
                note_sample = diffSynthSampler(event.velocity, duration_sec)
                duration_note_mapping[str(duration_sec)] = note_sample / np.max(np.abs(note_sample))

            # note_audio = pyrb.pitch_shift(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
            # note_audio = pitch_shift_audio(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
            note_audio = pitch_shift_librosa(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
            end_sample = start_sample + len(note_audio)
            track_audio[start_sample:end_sample] += note_audio

        return track_audio

    def _get_tempo_at(self, time_tick):
        current_tempo = 500000  # Start with default MIDI tempo 120 BPM
        elapsed_ticks = 0

        for tempo_change in self.tempo_events:
            if elapsed_ticks + tempo_change[0] > time_tick:
                return current_tempo
            elapsed_ticks += tempo_change[0]
            current_tempo = tempo_change[1]

        return current_tempo

    def _get_total_time(self):
        total_time = 0
        current_tempo = 500000  # Start with default MIDI tempo 120 BPM

        for event in self.events:
            current_tempo = self._get_tempo_at(event.start_time)
            seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
            total_time += event.duration * seconds_per_tick

        return total_time


class DiffSynth:
    def __init__(self, instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder, text_encoder, CLAP_tokenizer, device,
                 model_sample_rate=16000, timesteps=1000, channels=4, freq_resolution=512, time_resolution=256, VAE_scale=4, squared=False):

        self.noise_prediction_model = noise_prediction_model
        self.VAE_quantizer = VAE_quantizer
        self.VAE_decoder = VAE_decoder
        self.device = device
        self.model_sample_rate = model_sample_rate
        self.timesteps = timesteps
        self.channels = channels
        self.freq_resolution = freq_resolution
        self.time_resolution = time_resolution
        self.height = int(freq_resolution/VAE_scale)
        self.VAE_scale = VAE_scale
        self.squared = squared
        self.text_encoder = text_encoder
        self.CLAP_tokenizer = CLAP_tokenizer

        # instruments_configs is a dict: string -> (condition, negative_condition, guidance_scale, sample_steps, seed, initial_noise, sampler)
        self.instruments_configs = instruments_configs
        self.diffSynthSamplers = {}
        self._update_instruments()

    def _update_instruments(self):

        def diffSynthSamplerWrapper(instruments_config):

            def diffSynthSampler(velocity, duration_sec, sample_rate=16000):

                condition = self.text_encoder.get_text_features(**self.CLAP_tokenizer([""], padding=True, return_tensors="pt")).to(self.device)
                sample_steps = instruments_config['sample_steps']
                sampler = instruments_config['sampler']
                noising_strength = instruments_config['noising_strength']
                latent_representation = instruments_config['latent_representation']
                attack = instruments_config['attack']
                before_release = instruments_config['before_release']

                assert sample_rate == self.model_sample_rate, "sample_rate != model_sample_rate"

                width = int(self.time_resolution * ((duration_sec + 1) / 4) / self.VAE_scale)

                mySampler = DiffSynthSampler(self.timesteps, height=128, channels=4, noise_strategy="repeat", mute=True)
                mySampler.respace(list(np.linspace(0, self.timesteps - 1, sample_steps, dtype=np.int32)))

                # mask = 1, freeze
                latent_mask = torch.zeros((1, 1, self.height, width), dtype=torch.float32).to(self.device)
                latent_mask[:, :, :, :int(self.time_resolution * (attack / 4) / self.VAE_scale)] = 1.0
                latent_mask[:, :, :, -int(self.time_resolution * ((before_release+1) / 4) / self.VAE_scale):] = 1.0

                latent_representations, _ = \
                    mySampler.inpaint_sample(model=self.noise_prediction_model, shape=(1, self.channels, self.height, width),
                                             noising_strength=noising_strength, condition=condition,
                                             guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                             sampler=sampler,
                                             use_dynamic_mask=True, end_noise_level_ratio=0.0,
                                             mask_flexivity=1.0)

                latent_representations = latent_representations[-1]

                quantized_latent_representations, _, (_, _, _) = self.VAE_quantizer(latent_representations)
                # Todo: remove hard-coding
                flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(self.VAE_decoder,
                                                                                                            quantized_latent_representations,
                                                                                                            resolution=(512, width * self.VAE_scale),
                                                                                                            original_STFT_batch=None,
                                                                                                            )

                return rec_signals[0]

            return diffSynthSampler

        for key in self.instruments_configs.keys():
            self.diffSynthSamplers[key] = diffSynthSamplerWrapper(self.instruments_configs[key])

    def get_music(self, mid, instrument_names, sample_rate=16000):
        tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
-       assert len(tracks)
+       assert len(tracks) <= len(instrument_names), f"len(tracks) = {len(tracks)} > {len(instrument_names)} = len(instrument_names)"

        track_audios = [track.synthesize_track(self.diffSynthSamplers[instrument_names[i]], sample_rate=sample_rate) for i, track in enumerate(tracks)]

        # Pad every track to the length of the longest one so they can be mixed
        max_length = max(len(audio) for audio in track_audios)
        full_audio = np.zeros(max_length, dtype=np.float32)  # initialize the full mix with zeros
        for audio in track_audios:
            # a track may be shorter than the mix, so pad it with zeros
            padded_audio = np.pad(audio, (0, max_length - len(audio)), 'constant')
            full_audio += padded_audio  # mix in this track

        return full_audio
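pitch_shift_librosa above shifts the pitch in increments of at most step_size semitones instead of one large jump. A short usage sketch of that helper as defined in this commit (the test tone and the shift amount are arbitrary choices, and the import assumes the repository layout shown above):

import numpy as np

from webUI.natural_language_guided.track_maker import pitch_shift_librosa

sample_rate = 16000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)  # one second of A3

# Shift up by 7 semitones; with step_size=4 this runs two librosa pitch shifts (4 + 3 semitones).
shifted = pitch_shift_librosa(tone, sample_rate, total_steps=7)
print(shifted.shape, shifted.dtype)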
webUI/natural_language_guided_4/GAN.py
ADDED
@@ -0,0 +1,164 @@
import gradio as gr
import numpy as np
import torch

from tools import safe_int
from webUI.natural_language_guided_STFT.utils import encodeBatch2GradioOutput, latent_representation_to_Gradio_image, \
    add_instrument


def get_testGAN(gradioWebUI, text2sound_state, virtual_instruments_state):
    # Load configurations
    gan_generator = gradioWebUI.GAN_generator
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels

    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def gan_random_sample(text2sound_prompts, text2sound_negative_prompts, text2sound_batchsize,
                          text2sound_duration,
                          text2sound_guidance_scale, text2sound_sampler,
                          text2sound_sample_steps, text2sound_seed,
                          text2sound_dict):
        text2sound_seed = safe_int(text2sound_seed, 12345678)

        width = int(time_resolution * ((text2sound_duration + 1) / 4) / VAE_scale)

        text2sound_batchsize = int(text2sound_batchsize)

        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
                device)

        CFG = int(text2sound_guidance_scale)

        condition = text2sound_embedding.repeat(text2sound_batchsize, 1)

        noise = torch.randn(text2sound_batchsize, channels, height, width).to(device)
        latent_representations = gan_generator(noise, condition)

        print(latent_representations[0, 0, :3, :3])

        latent_representation_gradio_images = []
        quantized_latent_representation_gradio_images = []
        new_sound_spectrogram_gradio_images = []
        new_sound_rec_signals_gradio = []

        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
        # Todo: remove hard-coding
        flipped_log_spectrums, rec_signals = encodeBatch2GradioOutput(VAE_decoder, quantized_latent_representations,
                                                                      resolution=(512, width * VAE_scale),
                                                                      centralized=False,
                                                                      squared=squared)

        for i in range(text2sound_batchsize):
            latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
            quantized_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(quantized_latent_representations[i]))
            new_sound_spectrogram_gradio_images.append(flipped_log_spectrums[i])
            new_sound_rec_signals_gradio.append((sample_rate, rec_signals[i]))

        text2sound_dict["latent_representations"] = latent_representations.to("cpu").detach().numpy()
        text2sound_dict["quantized_latent_representations"] = quantized_latent_representations.to("cpu").detach().numpy()
        text2sound_dict["latent_representation_gradio_images"] = latent_representation_gradio_images
        text2sound_dict["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
        text2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
        text2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio

        text2sound_dict["condition"] = condition.to("cpu").detach().numpy()
        # text2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
        text2sound_dict["guidance_scale"] = CFG
        text2sound_dict["sampler"] = text2sound_sampler

        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][0],
                text2sound_quantized_latent_representation_image:
                    text2sound_dict["quantized_latent_representation_gradio_images"][0],
                text2sound_sampled_spectrogram_image: text2sound_dict["new_sound_spectrogram_gradio_images"][0],
                text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][0],
                text2sound_seed_textbox: text2sound_seed,
                text2sound_state: text2sound_dict,
                text2sound_sample_index_slider: gr.update(minimum=0, maximum=text2sound_batchsize - 1, value=0, step=1,
                                                          visible=True,
                                                          label="Sample index.",
                                                          info="Swipe to view other samples")}

    def show_random_sample(sample_index, text2sound_dict):
        sample_index = int(sample_index)
        text2sound_dict["sample_index"] = sample_index
        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][
                    sample_index],
                text2sound_quantized_latent_representation_image:
                    text2sound_dict["quantized_latent_representation_gradio_images"][sample_index],
                text2sound_sampled_spectrogram_image: text2sound_dict["new_sound_spectrogram_gradio_images"][
                    sample_index],
                text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][sample_index]}

    with gr.Tab("Text2sound_GAN"):
        gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")

            with gr.Column(scale=1):
                text2sound_sampling_button = gr.Button(variant="primary",
                                                       value="Generate a batch of samples and show "
                                                             "the first one",
                                                       scale=1)
                text2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
                                                           label="Sample index",
                                                           info="Swipe to view other samples")
        with gr.Row(variant="panel"):
            with gr.Column(scale=1, variant="panel"):
                text2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
                text2sound_sampler_radio = gradioWebUI.get_sampler_radio()
                text2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
                text2sound_duration_slider = gradioWebUI.get_duration_slider()
                text2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
                text2sound_seed_textbox = gradioWebUI.get_seed_textbox()

            with gr.Column(scale=1):
                text2sound_sampled_spectrogram_image = gr.Image(label="Sampled spectrogram", type="numpy", height=420)
                text2sound_sampled_audio = gr.Audio(type="numpy", label="Play")

        with gr.Row(variant="panel"):
            text2sound_latent_representation_image = gr.Image(label="Sampled latent representation", type="numpy",
                                                              height=200, width=100)
            text2sound_quantized_latent_representation_image = gr.Image(label="Quantized latent representation",
                                                                        type="numpy", height=200, width=100)

    text2sound_sampling_button.click(gan_random_sample,
                                     inputs=[text2sound_prompts_textbox,
                                             text2sound_negative_prompts_textbox,
                                             text2sound_batchsize_slider,
                                             text2sound_duration_slider,
                                             text2sound_guidance_scale_slider, text2sound_sampler_radio,
                                             text2sound_sample_steps_slider,
                                             text2sound_seed_textbox,
                                             text2sound_state],
                                     outputs=[text2sound_latent_representation_image,
                                              text2sound_quantized_latent_representation_image,
                                              text2sound_sampled_spectrogram_image,
                                              text2sound_sampled_audio,
                                              text2sound_seed_textbox,
                                              text2sound_state,
                                              text2sound_sample_index_slider])

    text2sound_sample_index_slider.change(show_random_sample,
                                          inputs=[text2sound_sample_index_slider, text2sound_state],
                                          outputs=[text2sound_latent_representation_image,
                                                   text2sound_quantized_latent_representation_image,
                                                   text2sound_sampled_spectrogram_image,
                                                   text2sound_sampled_audio])
webUI/natural_language_guided_4/README.py
ADDED
@@ -0,0 +1,53 @@
import gradio as gr

readme_content = """## Stable Diffusion for Sound Generation

This project applies stable diffusion[1] to sound generation. Inspired by the work of AUTOMATIC1111, 2022[2], we have implemented a preliminary version of text2sound, sound2sound, inpaint, as well as an additional interpolation feature, all accessible through a web UI.

### Neural Network Training Data:
The neural network is trained using the filtered NSynth dataset[3], which is a large-scale and high-quality collection of annotated musical notes, comprising 305,979 musical notes. However, for this project, only samples with a pitch set to E3 were used, resulting in an actual training sample size of 4,096, making it a low-resource project.

The training took place on an NVIDIA Tesla T4 GPU and spanned approximately 10 hours.

### Natural Language Guidance:
Natural language guidance is derived from the multi-label annotations of the NSynth dataset. The labels included in the training are:

- **Instrument Families**: bass, brass, flute, guitar, keyboard, mallet, organ, reed, string, synth lead, vocal.

- **Instrument Sources**: acoustic, electronic, synthetic.

- **Note Qualities**: bright, dark, distortion, fast decay, long release, multiphonic, nonlinear env, percussive, reverb, tempo-synced.

### Usage Hints:

1. **Prompt Format**: It's recommended to use the format "label1, label2, label3", e.g., "organ, dark, long release".

2. **Unique Sounds**: If you keep generating the same sound, try setting a different seed!

3. **Sample Indexing**: Drag the "Sample index slider" to view other samples within the generated batch.

4. **Running on CPU**: Be cautious with the settings for 'batchsize' and 'sample_steps' when running on CPU to avoid timeouts. Recommended settings are batchsize ≤ 4 and sample_steps = 15.

5. **Editing Sounds**: Generated audio can be downloaded and then re-uploaded for further editing at the sound2sound/inpaint sections.

6. **Guidance Scale**: A higher 'guidance_scale' intensifies the influence of natural language conditioning on the generation[4]. It's recommended to set it between 3 and 10.

7. **Noising Strength**: A smaller 'noising_strength' value makes the generated sound closer to the input sound.

References:

[1] Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (pp. 10684-10695).

[2] AUTOMATIC1111. (2022). Stable Diffusion Web UI [Computer software]. Retrieved from https://github.com/AUTOMATIC1111/stable-diffusion-webui

[3] Engel, J., Resnick, C., Roberts, A., Dieleman, S., Eck, D., Simonyan, K., & Norouzi, M. (2017). Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders.

[4] Ho, J., & Salimans, T. (2022). Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598.
"""

def get_readme_module():

    with gr.Tab("README"):
        # gr.Markdown("Use interpolation to generate a gradient sound sequence.")
        with gr.Column(scale=3):
            readme_textbox = gr.Textbox(label="readme", lines=40, value=readme_content, interactive=False)
webUI/natural_language_guided_4/__pycache__/gradio_webUI.cpython-310.pyc
ADDED
Binary file (3.55 kB).

webUI/natural_language_guided_4/__pycache__/inpaint_with_text.cpython-310.pyc
ADDED
Binary file (10.9 kB).

webUI/natural_language_guided_4/__pycache__/instruments.cpython-310.pyc
ADDED
Binary file (2.62 kB).

webUI/natural_language_guided_4/__pycache__/load_presets.cpython-310.pyc
ADDED
Binary file (2.74 kB).

webUI/natural_language_guided_4/__pycache__/note2music.cpython-310.pyc
ADDED
Binary file (7.62 kB).

webUI/natural_language_guided_4/__pycache__/sound2sound_with_text.cpython-310.pyc
ADDED
Binary file (9.32 kB).

webUI/natural_language_guided_4/__pycache__/text2sound.cpython-310.pyc
ADDED
Binary file (6.5 kB).

webUI/natural_language_guided_4/__pycache__/track_maker.cpython-310.pyc
ADDED
Binary file (7.56 kB).

webUI/natural_language_guided_4/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (5.89 kB).
webUI/natural_language_guided_4/build_instrument.py
ADDED
@@ -0,0 +1,305 @@
import librosa
import numpy as np
import torch
import gradio as gr
import mido
from io import BytesIO
# import pyrubberband as pyrb
import torchaudio.transforms as transforms

from model.DiffSynthSampler import DiffSynthSampler
from tools import adsr_envelope, adjust_audio_length
from webUI.natural_language_guided.track_maker import DiffSynth
from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
    spectrogram_to_Gradio_image


def time_stretch_audio(waveform, sample_rate, stretch_factor):
    # Convert numpy input to a torch.Tensor
    if isinstance(waveform, np.ndarray):
        waveform = torch.from_numpy(waveform)

    # Make sure the waveform is float32
    waveform = waveform.to(torch.float32)

    # STFT parameters
    n_fft = 2048  # STFT window size
    hop_length = n_fft // 4  # hop length set to a quarter of n_fft

    # Compute the short-time Fourier transform (STFT)
    stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length, return_complex=True)

    # Create the TimeStretch transform
    time_stretch = transforms.TimeStretch(hop_length=hop_length, n_freq=1025, fixed_rate=False)

    print(stft.shape)
    # Apply time stretching
    stretched_stft = time_stretch(stft, stretch_factor)

    # Convert the stretched STFT back to a time-domain waveform
    stretched_waveform = torch.istft(stretched_stft, n_fft=n_fft, hop_length=hop_length)

    # Return the processed waveform as a numpy array
    return stretched_waveform.detach().numpy()


def get_build_instrument_module(gradioWebUI, virtual_instruments_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels

    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def select_sound(virtual_instrument_name, virtual_instruments_dict):
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]

        return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
                source_sound_phase_image: virtual_instrument["phase_gradio_image"],
                source_sound_audio: virtual_instrument["signal"]}

    def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names, virtual_instruments_dict):

        if noising_strength < 1:
            print(f"Warning: making track with noising_strength = {noising_strength} < 1")
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        sample_steps = int(inpaint_steps)

        instrument_names = instrument_names.split("@")
        instruments_configs = {}
        for virtual_instrument_name in instrument_names:
            virtual_instrument = virtual_instruments[virtual_instrument_name]

            latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
            sampler = virtual_instrument["sampler"]

            batchsize = 1

            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)

            mid = mido.MidiFile(file=BytesIO(midi))
            instruments_configs[virtual_instrument_name] = {
                'sample_steps': sample_steps,
                'sampler': sampler,
                'noising_strength': noising_strength,
                'latent_representation': latent_representation,
                'attack': attack,
                'before_release': before_release}

        diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)

        full_audio = diffSynth.get_music(mid, instrument_names)

        return {track_audio: (sample_rate, full_audio)}

    def test_duration_inpaint(virtual_instrument_name, inpaint_steps, duration, noising_strength, end_noise_level_ratio, attack, before_release, mask_flexivity, virtual_instruments_dict, use_dynamic_mask):
        width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)

        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]

        latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
        sample_steps = int(inpaint_steps)
        sampler = virtual_instrument["sampler"]
        batchsize = 1

        mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
        mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))

        latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)

        # mask = 1, freeze
        latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)

        latent_mask[:, :, :, :int(time_resolution * (attack / 4) / VAE_scale)] = 1.0
        latent_mask[:, :, :, -int(time_resolution * ((before_release + 1) / 4) / VAE_scale):] = 1.0

        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([""], padding=True, return_tensors="pt"))[0].to(device)
        condition = text2sound_embedding.repeat(1, 1)

        latent_representations, initial_noise = \
            mySampler.inpaint_sample(model=uNet, shape=(batchsize, channels, height, width),
                                     noising_strength=noising_strength,
                                     guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                     condition=condition, sampler=sampler,
                                     use_dynamic_mask=use_dynamic_mask,
                                     end_noise_level_ratio=end_noise_level_ratio,
                                     mask_flexivity=mask_flexivity)

        latent_representations = latent_representations[-1]

        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
        # Todo: remove hard-coding
        flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
                                                                                                    quantized_latent_representations,
                                                                                                    resolution=(512, width * VAE_scale),
                                                                                                    original_STFT_batch=None)

        return {test_duration_spectrogram_image: flipped_log_spectrums[0],
                test_duration_phase_image: flipped_phases[0],
                test_duration_audio: (sample_rate, rec_signals[0])}

    def test_duration_envelope(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):

        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        sample_rate, signal = virtual_instrument["signal"]

        applied_signal = adsr_envelope(signal=signal, sample_rate=sample_rate, duration=duration,
                                       attack_time=0.0, decay_time=0.0, sustain_level=1.0, release_time=release)

        D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
        spc = np.abs(D)
        phase = np.angle(D)

        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)

        return {test_duration_spectrogram_image: flipped_log_spectrum,
                test_duration_phase_image: flipped_phase,
                test_duration_audio: (sample_rate, applied_signal)}

    def test_duration_stretch(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):

        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        sample_rate, signal = virtual_instrument["signal"]

        s = 3 / duration
        # applied_signal = pyrb.time_stretch(signal, sample_rate, s)
        applied_signal = time_stretch_audio(signal, sample_rate, s)
        applied_signal = adjust_audio_length(applied_signal, int((duration + 1) * sample_rate), sample_rate, sample_rate)

        D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
        spc = np.abs(D)
        phase = np.angle(D)

        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)

        return {test_duration_spectrogram_image: flipped_log_spectrum,
                test_duration_phase_image: flipped_phase,
                test_duration_audio: (sample_rate, applied_signal)}

    with gr.Tab("TestInTrack"):
        gr.Markdown("Make music with generated sounds!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
                                                     placeholder="Name of your instrument", scale=1)
                select_instrument_button = gr.Button(variant="primary", value="Select", scale=1)
            with gr.Column(scale=3):
                inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0, label="inpaint_steps")
                noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
                end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01, label="end_noise_level_ratio")
                attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
                before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="before_release in sec")
                release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
                mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01, label="mask_flexivity")
            with gr.Column(scale=3):
                use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
                test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
                test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
                test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
                duration_slider = gradioWebUI.get_duration_slider()

        with gr.Row(variant="panel"):
            with gr.Column(scale=2):
                with gr.Row(variant="panel"):
                    source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                              height=600, scale=1)
                    source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                        height=600, scale=1)
                source_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)

            with gr.Column(scale=3):
                with gr.Row(variant="panel"):
                    test_duration_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                               height=600, scale=1)
                    test_duration_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                         height=600, scale=1)
                test_duration_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)

        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                # track_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                #                                    height=420, scale=1)
                midi_file = gr.File(label="Upload midi file", type="binary")
                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
                                                      placeholder="Names of your instrument used to play the midi", scale=1)
                track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
                make_track_button = gr.Button(variant="primary", value="Make track", scale=1)

    select_instrument_button.click(select_sound,
                                   inputs=[instrument_name_textbox, virtual_instruments_state],
                                   outputs=[source_sound_spectrogram_image,
                                            source_sound_phase_image,
                                            source_sound_audio])

    test_duration_envelope_button.click(test_duration_envelope,
                                        inputs=[instrument_name_textbox, duration_slider,
                                                noising_strength_slider,
                                                attack_slider,
                                                before_release_slider,
                                                release_slider,
                                                virtual_instruments_state,
                                                ],
                                        outputs=[test_duration_spectrogram_image,
                                                 test_duration_phase_image,
                                                 test_duration_audio])

    test_duration_stretch_button.click(test_duration_stretch,
                                       inputs=[instrument_name_textbox, duration_slider,
                                               noising_strength_slider,
                                               attack_slider,
                                               before_release_slider,
                                               release_slider,
                                               virtual_instruments_state,
                                               ],
                                       outputs=[test_duration_spectrogram_image,
                                                test_duration_phase_image,
                                                test_duration_audio])

    test_duration_inpaint_button.click(test_duration_inpaint,
                                       inputs=[instrument_name_textbox,
                                               inpaint_steps_slider,
                                               duration_slider,
                                               noising_strength_slider,
                                               end_noise_level_ratio_slider,
                                               attack_slider,
                                               before_release_slider,
                                               mask_flexivity_slider,
                                               virtual_instruments_state,
                                               use_dynamic_mask_checkbox],
                                       outputs=[test_duration_spectrogram_image,
                                                test_duration_phase_image,
                                                test_duration_audio])

    make_track_button.click(make_track,
                            inputs=[inpaint_steps_slider, midi_file,
                                    noising_strength_slider,
                                    attack_slider,
                                    before_release_slider,
                                    instrument_names_textbox,
                                    virtual_instruments_state],
                            outputs=[track_audio])
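As a quick sanity check, the time_stretch_audio helper defined at the top of this file can be exercised on a synthetic tone. The snippet below is illustrative only and is not part of the commit; the 440 Hz test tone, 16 kHz sample rate, and stretch factors are arbitrary choices.

# Illustrative use of time_stretch_audio (defined above); not part of the commit.
import numpy as np

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)

faster = time_stretch_audio(tone, sr, 1.5)   # rate > 1 shortens the signal
slower = time_stretch_audio(tone, sr, 0.75)  # rate < 1 lengthens it
print(len(tone), len(faster), len(slower))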
webUI/natural_language_guided_4/gradio_webUI.py
ADDED
@@ -0,0 +1,68 @@
import gradio as gr


class GradioWebUI():

    def __init__(self, device, VAE, uNet, CLAP, CLAP_tokenizer,
                 freq_resolution=512, time_resolution=256, channels=4, timesteps=1000,
                 sample_rate=16000, squared=False, VAE_scale=4,
                 flexible_duration=False, noise_strategy="repeat",
                 GAN_generator=None):
        self.device = device
        self.VAE_encoder, self.VAE_quantizer, self.VAE_decoder = VAE._encoder, VAE._vq_vae, VAE._decoder
        self.uNet = uNet
        self.CLAP, self.CLAP_tokenizer = CLAP, CLAP_tokenizer
        self.freq_resolution, self.time_resolution = freq_resolution, time_resolution
        self.channels = channels
        self.GAN_generator = GAN_generator

        self.timesteps = timesteps
        self.sample_rate = sample_rate
        self.squared = squared
        self.VAE_scale = VAE_scale
        self.flexible_duration = flexible_duration
        self.noise_strategy = noise_strategy

        self.text2sound_state = gr.State(value={})
        self.interpolation_state = gr.State(value={})
        self.sound2sound_state = gr.State(value={})
        self.inpaint_state = gr.State(value={})

    def get_sample_steps_slider(self):
        default_steps = 10 if (self.device == "cpu") else 20
        return gr.Slider(minimum=10, maximum=100, value=default_steps, step=1,
                         label="Sample steps",
                         info="Sampling steps. The more sampling steps, the better the "
                              "theoretical result, but the more time it consumes.")

    def get_sampler_radio(self):
        # return gr.Radio(choices=["ddpm", "ddim", "dpmsolver++", "dpmsolver"], value="ddim", label="Sampler")
        return gr.Radio(choices=["ddpm", "ddim"], value="ddim", label="Sampler")

    def get_batchsize_slider(self, cpu_batchsize=1):
        return gr.Slider(minimum=1., maximum=16, value=cpu_batchsize if (self.device == "cpu") else 8, step=1, label="Batchsize")

    def get_time_resolution_slider(self):
        return gr.Slider(minimum=16., maximum=int(1024 / self.VAE_scale), value=int(256 / self.VAE_scale), step=1, label="Time resolution", interactive=True)

    def get_duration_slider(self):
        if self.flexible_duration:
            return gr.Slider(minimum=0.25, maximum=8., value=3., step=0.01, label="duration in sec")
        else:
            return gr.Slider(minimum=1., maximum=8., value=3., step=1., label="duration in sec")

    def get_guidance_scale_slider(self):
        return gr.Slider(minimum=0., maximum=20., value=6., step=1.,
                         label="Guidance scale",
                         info="The larger this value, the more the generated sound is "
                              "influenced by the condition. Setting it to 0 is equivalent to "
                              "the negative case.")

    def get_noising_strength_slider(self, default_noising_strength=0.7):
        return gr.Slider(minimum=0.0, maximum=1.00, value=default_noising_strength, step=0.01,
                         label="noising strength",
                         info="The smaller this value, the closer the generated sound is "
                              "to the original.")

    def get_seed_textbox(self):
        return gr.Textbox(label="Seed", lines=1, placeholder="seed", value=0)
webUI/natural_language_guided_4/inpaint_with_text.py
ADDED
@@ -0,0 +1,371 @@
import librosa
import numpy as np
import torch
import gradio as gr
from scipy.ndimage import zoom

from model.DiffSynthSampler import DiffSynthSampler
from tools import adjust_audio_length, safe_int, pad_STFT, encode_stft
from webUI.natural_language_guided_4.utils import latent_representation_to_Gradio_image, InputBatch2Encode_STFT, \
    encodeBatch2GradioOutput_STFT, add_instrument, average_np_arrays


def get_triangle_mask(height, width):
    mask = np.zeros((height, width))
    slope = 8 / 3
    for i in range(height):
        for j in range(width):
            if i < slope * j:
                mask[i, j] = 1
    return mask


def get_inpaint_with_text_module(gradioWebUI, inpaintWithText_state, virtual_instruments_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
    timesteps = gradioWebUI.timesteps
    VAE_encoder = gradioWebUI.VAE_encoder
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin, inpaintWithText_dict):

        origin_sr, origin_audio = sound2sound_origin

        origin_audio = origin_audio / np.max(np.abs(origin_audio))

        width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
        audio_length = 256 * (VAE_scale * width - 1)
        origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)

        D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
        padded_D = pad_STFT(D)
        encoded_D = encode_stft(padded_D)

        # Todo: justify batchsize to 1
        origin_spectrogram_batch_tensor = torch.from_numpy(
            np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)

        # Todo: remove hard-coding
        origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
            VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
            squared=squared)

        inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
        inpaintWithText_dict[
            "sound2sound_origin_upload_latent_representation_image"] = latent_representation_to_Gradio_image(
            origin_latent_representations[0]).tolist()
        inpaintWithText_dict[
            "sound2sound_origin_upload_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
            quantized_origin_latent_representations[0]).tolist()
        return {sound2sound_origin_spectrogram_image: origin_flipped_log_spectrums[0],
                sound2sound_origin_phase_image: origin_flipped_phases[0],
                sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
                    origin_latent_representations[0]),
                sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
                    quantized_origin_latent_representations[0]),
                sound2sound_origin_microphone_latent_representation_image: gr.update(),
                sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
                inpaintWithText_state: inpaintWithText_dict}

    def sound2sound_sample(sound2sound_origin_spectrogram,
                           text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
                           sound2sound_guidance_scale, sound2sound_sampler,
                           sound2sound_sample_steps,
                           sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area,
                           mask_time_begin, mask_time_end, mask_frequency_begin, mask_frequency_end,
                           inpaintWithText_dict
                           ):

        # input preprocessing
        sound2sound_seed = safe_int(sound2sound_seed, 12345678)
        sound2sound_batchsize = int(sound2sound_batchsize)
        noising_strength = sound2sound_noising_strength
        sound2sound_sample_steps = int(sound2sound_sample_steps)
        CFG = int(sound2sound_guidance_scale)

        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
                device)

        averaged_transparency = average_np_arrays(sound2sound_origin_spectrogram["layers"])
        # print(f"averaged_transparency: {averaged_transparency}")
        averaged_transparency = averaged_transparency[:, :, -1]
        # print(f"np.shape(averaged_transparency): {np.shape(averaged_transparency)}")
        # print(f"np.mean(averaged_transparency): {np.mean(averaged_transparency)}")
        origin_latent_representations = torch.tensor(
            inpaintWithText_dict["origin_upload_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
            device)

        merged_mask = np.where(averaged_transparency > 0, 1, 0)
        latent_mask = zoom(merged_mask, (1 / VAE_scale, 1 / VAE_scale))
        latent_mask = np.clip(latent_mask, 0, 1)
        # print(f"latent_mask.avg = {np.mean(latent_mask)}")
        latent_mask[int(mask_frequency_begin):int(mask_frequency_end),
                    int(mask_time_begin * time_resolution / (VAE_scale * 4)):int(
                        mask_time_end * time_resolution / (VAE_scale * 4))] = 1

        if sound2sound_inpaint_area == "masked":
            latent_mask = 1 - latent_mask
        latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(sound2sound_batchsize, channels, 1,
                                                                                     1).float().to(device)
        latent_mask = torch.flip(latent_mask, [2])

        mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
        unconditional_condition = \
            CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
                0]
        mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))

        normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)

        mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))

        # Todo: remove hard-coding
        width = origin_latent_representations.shape[-1]
        condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)

        new_sound_latent_representations, initial_noise = \
            mySampler.inpaint_sample(model=uNet, shape=(sound2sound_batchsize, channels, height, width),
                                     seed=sound2sound_seed,
                                     noising_strength=noising_strength,
                                     guide_img=origin_latent_representations, mask=latent_mask, return_tensor=True,
                                     condition=condition, sampler=sound2sound_sampler)

        new_sound_latent_representations = new_sound_latent_representations[-1]

        # Quantize new sound latent representations
        quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
        new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
            VAE_decoder,
            quantized_new_sound_latent_representations,
            resolution=(512, width * VAE_scale),
            original_STFT_batch=None)

        new_sound_latent_representation_gradio_images = []
        new_sound_quantized_latent_representation_gradio_images = []
        new_sound_spectrogram_gradio_images = []
        new_sound_phase_gradio_images = []
        new_sound_rec_signals_gradio = []
        for i in range(sound2sound_batchsize):
            new_sound_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
            new_sound_quantized_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
            new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
            new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
            new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))

        inpaintWithText_dict[
            "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
        inpaintWithText_dict[
            "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
        inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
        inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
        inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio

        inpaintWithText_dict["latent_representations"] = new_sound_latent_representations.to("cpu").detach().numpy()
        inpaintWithText_dict["quantized_latent_representations"] = quantized_new_sound_latent_representations.to(
            "cpu").detach().numpy()
        inpaintWithText_dict["sampler"] = sound2sound_sampler

        return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
                    new_sound_latent_representations[0]),
                sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
                    quantized_new_sound_latent_representations[0]),
                sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
                sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
                sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
                sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
                                                           step=1.0,
                                                           visible=True,
                                                           label="Sample index",
                                                           info="Swipe to view other samples"),
                sound2sound_seed_textbox: sound2sound_seed,
                inpaintWithText_state: inpaintWithText_dict}

    def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
        sample_index = int(sound2sound_sample_index)
        return {sound2sound_new_sound_latent_representation_image:
                    inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
                sound2sound_new_sound_quantized_latent_representation_image:
                    inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
                sound2sound_new_sound_spectrogram_image: inpaintWithText_dict["new_sound_spectrogram_gradio_images"][
                    sample_index],
                sound2sound_new_sound_phase_image: inpaintWithText_dict["new_sound_phase_gradio_images"][
                    sample_index],
                sound2sound_new_sound_audio: inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index]}

    def save_virtual_instrument(sample_index, virtual_instrument_name, sound2sound_dict, virtual_instruments_dict):

        virtual_instruments_dict = add_instrument(sound2sound_dict, virtual_instruments_dict, virtual_instrument_name,
                                                  sample_index)
        return {virtual_instruments_state: virtual_instruments_dict,
                sound2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
                                                                placeholder=f"Saved as {virtual_instrument_name}!")}

    with gr.Tab("Inpaint"):
        gr.Markdown("Upload a musical note and select the area by drawing on \"Input spectrogram\" for inpainting!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")

            with gr.Column(scale=1):
                sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)

                sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
                                                            label="Sample index",
                                                            info="Swipe to view other samples")

        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                sound2sound_duration_slider = gradioWebUI.get_duration_slider()
                sound2sound_origin_audio = gr.Audio(
                    sources=["microphone", "upload"], label="Upload/Record source sound",
                    waveform_options=gr.WaveformOptions(
                        waveform_color="#01C6FF",
                        waveform_progress_color="#0066B4",
                        skip_length=1,
                        show_controls=False,
                    ),
                )

                with gr.Row(variant="panel"):
                    with gr.Tab("Sound2sound settings"):
                        sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
                        sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
                        sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
                        sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider()
                        sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
                        sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()

                    with gr.Tab("Mask prototypes"):
                        with gr.Tab("Mask along time axis"):
                            mask_time_begin_slider = gr.Slider(minimum=0.0, maximum=4.00, value=0.0, step=0.01,
                                                               label="Begin time")
                            mask_time_end_slider = gr.Slider(minimum=0.0, maximum=4.00, value=0.0, step=0.01,
                                                             label="End time")
                        with gr.Tab("Mask along frequency axis"):
                            mask_frequency_begin_slider = gr.Slider(minimum=0, maximum=127, value=0, step=1,
                                                                    label="Begin freq pixel")
                            mask_frequency_end_slider = gr.Slider(minimum=0, maximum=127, value=0, step=1,
                                                                  label="End freq pixel")

            with gr.Column(scale=1):
                with gr.Row(variant="panel"):
                    sound2sound_origin_spectrogram_image = gr.ImageEditor(label="Input spectrogram (draw here!)",
                                                                          type="numpy",
                                                                          visible=True, height=600, scale=1)

                    sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                                       height=600, scale=1)

                with gr.Row(variant="panel"):
                    sound2sound_inpaint_area_radio = gr.Radio(label="Inpainting area", choices=["masked", "unmasked"],
                                                              value="masked", scale=1)

                    sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False,
                                                           waveform_options=gr.WaveformOptions(
                                                               waveform_color="#FFB6C1",
                                                               waveform_progress_color="#FF0000",
                                                               skip_length=1,
                                                               show_controls=False,
                                                           ), scale=1)

                with gr.Row(variant="panel"):
                    sound2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
                                                                     placeholder="Name of your instrument")
                    sound2sound_save_instrument_button = gr.Button(variant="primary",
                                                                   value="Save instrument",
                                                                   scale=1)

        with gr.Row(variant="panel"):
            sound2sound_origin_upload_latent_representation_image = gr.Image(label="Original latent representation",
                                                                             type="numpy", height=800,
                                                                             visible=False)
            sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
                label="Original quantized latent representation", type="numpy", height=800, visible=False)

            sound2sound_origin_microphone_latent_representation_image = gr.Image(label="Original latent representation",
                                                                                 type="numpy", height=800,
                                                                                 visible=False)
            sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
                label="Original quantized latent representation", type="numpy", height=800, visible=False)

            sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
                                                                         type="numpy", height=800, visible=False)
            sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
                label="New sound quantized latent representation", type="numpy", height=800, visible=False)

            sound2sound_origin_phase_image = gr.Image(label="Original upload phase",
                                                      type="numpy", visible=False)

            sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                         height=600, scale=1, visible=False)

    sound2sound_origin_audio.change(receive_upload_origin_audio,
                                    inputs=[sound2sound_duration_slider, sound2sound_origin_audio,
                                            inpaintWithText_state],
                                    outputs=[sound2sound_origin_spectrogram_image,
                                             sound2sound_origin_phase_image,
                                             sound2sound_origin_upload_latent_representation_image,
                                             sound2sound_origin_upload_quantized_latent_representation_image,
                                             sound2sound_origin_microphone_latent_representation_image,
                                             sound2sound_origin_microphone_quantized_latent_representation_image,
                                             inpaintWithText_state])

    sound2sound_sample_button.click(sound2sound_sample,
                                    inputs=[sound2sound_origin_spectrogram_image,
                                            text2sound_prompts_textbox,
                                            text2sound_negative_prompts_textbox,
                                            sound2sound_batchsize_slider,
                                            sound2sound_guidance_scale_slider,
                                            sound2sound_sampler_radio,
                                            sound2sound_sample_steps_slider,
                                            sound2sound_noising_strength_slider,
                                            sound2sound_seed_textbox,
                                            sound2sound_inpaint_area_radio,
                                            mask_time_begin_slider,
                                            mask_time_end_slider,
                                            mask_frequency_begin_slider,
                                            mask_frequency_end_slider,
                                            inpaintWithText_state],
                                    outputs=[sound2sound_new_sound_latent_representation_image,
                                             sound2sound_new_sound_quantized_latent_representation_image,
                                             sound2sound_new_sound_spectrogram_image,
                                             sound2sound_new_sound_phase_image,
                                             sound2sound_new_sound_audio,
                                             sound2sound_sample_index_slider,
                                             sound2sound_seed_textbox,
                                             inpaintWithText_state])

    sound2sound_sample_index_slider.change(show_sound2sound_sample,
                                           inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
                                           outputs=[sound2sound_new_sound_latent_representation_image,
                                                    sound2sound_new_sound_quantized_latent_representation_image,
                                                    sound2sound_new_sound_spectrogram_image,
                                                    sound2sound_new_sound_phase_image,
                                                    sound2sound_new_sound_audio])

    sound2sound_save_instrument_button.click(save_virtual_instrument,
                                             inputs=[sound2sound_sample_index_slider,
                                                     sound2sound_instrument_name_textbox,
                                                     inpaintWithText_state,
                                                     virtual_instruments_state],
                                             outputs=[virtual_instruments_state,
                                                      sound2sound_instrument_name_textbox])
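The core of sound2sound_sample above is converting the user's drawing into a latent-space mask. The standalone sketch below reproduces that step with dummy data so it can be checked in isolation; the 512×256 canvas size and VAE_scale = 4 mirror the defaults used in this module, and the painted region is made up.

# Standalone sketch of the drawn-mask -> latent-mask conversion used above.
import numpy as np
from scipy.ndimage import zoom

VAE_scale = 4
alpha = np.zeros((512, 256))           # alpha channel of the drawing layer
alpha[100:200, 50:120] = 255           # pretend the user painted this region

pixel_mask = np.where(alpha > 0, 1, 0)                  # 1 where the user drew
latent_mask = zoom(pixel_mask, (1 / VAE_scale, 1 / VAE_scale))
latent_mask = np.clip(latent_mask, 0, 1)                # now 128 x 64

inpaint_area = "masked"
if inpaint_area == "masked":
    # In the module above, mask == 1 means "freeze"; to regenerate the drawn
    # region, the painted pixels must end up as 0.
    latent_mask = 1 - latent_mask

print(latent_mask.shape, latent_mask.mean())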
webUI/natural_language_guided_4/instruments.py
ADDED
@@ -0,0 +1,60 @@
import gradio as gr
import numpy as np

from model.DiffSynthSampler import DiffSynthSampler
from tools import safe_int, read_wav_to_numpy
from webUI.natural_language_guided.utils import latent_representation_to_Gradio_image, \
    encodeBatch2GradioOutput_STFT, add_instrument
from webUI.natural_language_guided_4.utils import resize_image_to_aspect_ratio


def get_instruments_module(gradioWebUI, virtual_instruments_state):

    with gr.Tab("Instruments"):
        gr.Markdown("Use neural networks to select random sounds for your favorite instrument!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                input_text = gr.Textbox(label="input")

                @gr.render(inputs=input_text)
                def show_split(text):
                    textboxes = []

                    if len(text) == 0:
                        gr.Markdown("## No Input Provided")
                    else:
                        for letter in text:
                            textboxes.append(gr.Textbox(letter, interactive=True))

                    def merge(*splitted_texts):
                        out = ""
                        for t in splitted_texts:
                            out += t
                        return out

                    submit_button.click(merge, inputs=textboxes, outputs=merged_textbox)

                submit_button = gr.Button("submit")

                merged_textbox = gr.Textbox(placeholder="placeholder", interactive=False)

            with gr.Column(scale=1):

                @gr.render(inputs=virtual_instruments_state)
                def check_instruments(virtual_instruments_dict):
                    virtual_instruments = virtual_instruments_dict["virtual_instruments"]
                    instrument_names = list(virtual_instruments.keys())

                    instrument_dropdown = gr.Dropdown(
                        instrument_names, label="instrument", info="info placeholder"
                    )

                    def select_instrument(instrument):
                        print(f"instrument: {instrument}")
                        sr, signal = virtual_instruments[instrument]["signal"]
                        return {selected_instrument_audio: (sr, signal)}

                    instrument_dropdown.select(select_instrument, inputs=instrument_dropdown,
                                               outputs=selected_instrument_audio)

                selected_instrument_audio = gr.Audio(type="numpy", label="Play", scale=1, interactive=False)
webUI/natural_language_guided_4/load_presets.py
ADDED
@@ -0,0 +1,81 @@
import os

import librosa
import mido
import numpy as np
import torch

from tools import read_wav_to_numpy, pad_STFT, encode_stft
from webUI.natural_language_guided_4.gradio_webUI import GradioWebUI
from webUI.natural_language_guided_4.utils import InputBatch2Encode_STFT


def load_presets(gradioWebUI: GradioWebUI):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels

    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_encoder = gradioWebUI.VAE_encoder
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def add_preset_instruments(virtual_instruments, instrument_name):

        instruments_path = os.path.join("webUI", "presets", "instruments", f"{instrument_name}.wav")
        sample_rate, origin_audio = read_wav_to_numpy(instruments_path)

        D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
        padded_D = pad_STFT(D)
        encoded_D = encode_stft(padded_D)

        # Todo: justify batchsize to 1
        origin_spectrogram_batch_tensor = torch.from_numpy(
            np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)

        # Todo: remove hard-coding
        origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
            VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
            squared=squared)

        virtual_instrument = {"latent_representation": origin_latent_representations[0].to("cpu").detach().numpy(),
                              "quantized_latent_representation": quantized_origin_latent_representations[0].to(
                                  "cpu").detach().numpy(),
                              "sampler": "ddim",
                              "signal": (sample_rate, origin_audio),
                              "spectrogram_gradio_image": origin_flipped_log_spectrums[0],
                              "phase_gradio_image": origin_flipped_phases[0]}
        virtual_instruments[f"preset_{instrument_name}"] = virtual_instrument
        return virtual_instruments

    virtual_instruments = {}
    preset_instrument_names = ["ax", "electronic_sound", "organ", "synth_lead", "keyboard", "string"]
    for preset_instrument_name in preset_instrument_names:
        virtual_instruments = add_preset_instruments(virtual_instruments, preset_instrument_name)

    def load_midi_files():

        midis_dict = {}
        midi_file_names = ["Ode_to_Joy_Easy_variation", "Air_on_the_G_String", "Canon_in_D"]

        for midi_file_name in midi_file_names:
            midi_path = os.path.join("webUI", "presets", "midis", f"{midi_file_name}.mid")
            mid = mido.MidiFile(midi_path)
            midis_dict[midi_file_name] = mid

        return midis_dict

    midis = load_midi_files()

    return virtual_instruments, midis
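For orientation, the two dictionaries returned here are what the other modules expect to find in virtual_instruments_state and midi_files_state (they index the keys "virtual_instruments" and "midis"). A hedged sketch of that wiring follows; the exact call site in app.py is an assumption rather than something shown in this hunk.

# Hypothetical wiring of the presets into the Gradio states consumed by the
# other modules; the surrounding app setup is an assumption.
import gradio as gr

virtual_instruments, midis = load_presets(gradioWebUI)  # gradioWebUI assumed constructed

with gr.Blocks() as demo:
    virtual_instruments_state = gr.State(value={"virtual_instruments": virtual_instruments})
    midi_files_state = gr.State(value={"midis": midis})
    get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state)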
webUI/natural_language_guided_4/note2music.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import gradio as gr
|
3 |
+
import mido
|
4 |
+
from io import BytesIO
|
5 |
+
# import pyrubberband as pyrb
|
6 |
+
|
7 |
+
from webUI.natural_language_guided_4.track_maker import DiffSynth, Track
|
8 |
+
|
9 |
+
|
10 |
+
def get_arrangement_module(gradioWebUI, virtual_instruments_state, midi_files_state):
|
11 |
+
# Load configurations
|
12 |
+
uNet = gradioWebUI.uNet
|
13 |
+
freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
|
14 |
+
VAE_scale = gradioWebUI.VAE_scale
|
15 |
+
height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
|
16 |
+
|
17 |
+
timesteps = gradioWebUI.timesteps
|
18 |
+
VAE_quantizer = gradioWebUI.VAE_quantizer
|
19 |
+
VAE_decoder = gradioWebUI.VAE_decoder
|
20 |
+
CLAP = gradioWebUI.CLAP
|
21 |
+
CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
|
22 |
+
device = gradioWebUI.device
|
23 |
+
squared = gradioWebUI.squared
|
24 |
+
sample_rate = gradioWebUI.sample_rate
|
25 |
+
noise_strategy = gradioWebUI.noise_strategy
|
26 |
+
|
27 |
+
def read_midi(midi, midi_dict):
|
28 |
+
mid = mido.MidiFile(file=BytesIO(midi))
|
29 |
+
tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
|
30 |
+
|
31 |
+
midi_info_text = f"Uploaded midi:"
|
32 |
+
for i, track in enumerate(tracks):
|
33 |
+
midi_info_text += f"\n{len(track.events)} events loaded from Track {i}."
|
34 |
+
|
35 |
+
midis = midi_dict["midis"]
|
36 |
+
midis["uploaded_midi"] = mid
|
37 |
+
midi_dict["midis"] = midis
|
38 |
+
|
39 |
+
return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
|
40 |
+
placeholder=midi_info_text),
|
41 |
+
current_midi_state: "uploaded_midi",
|
42 |
+
midi_files_state: midi_dict}
|
43 |
+
|
44 |
+
def make_track(inpaint_steps, current_midi_name, midi_dict, max_notes, noising_strength, attack, before_release, current_instruments,
|
45 |
+
virtual_instruments_dict):
|
46 |
+
|
47 |
+
if noising_strength < 1:
|
48 |
+
print(f"Warning: making track with noising_strength = {noising_strength} < 1")
|
49 |
+
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
|
50 |
+
sample_steps = int(inpaint_steps)
|
51 |
+
|
52 |
+
print(f"current_instruments: {current_instruments}")
|
53 |
+
instrument_names = current_instruments
|
54 |
+
instruments_configs = {}
|
55 |
+
|
56 |
+
for virtual_instrument_name in instrument_names:
|
57 |
+
virtual_instrument = virtual_instruments[virtual_instrument_name]
|
58 |
+
|
59 |
+
latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(
|
60 |
+
device)
|
61 |
+
sampler = virtual_instrument["sampler"]
|
62 |
+
|
63 |
+
batchsize = 1
|
64 |
+
|
65 |
+
latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
|
66 |
+
|
67 |
+
instruments_configs[virtual_instrument_name] = {
|
68 |
+
'sample_steps': sample_steps,
|
69 |
+
'sampler': sampler,
|
70 |
+
'noising_strength': noising_strength,
|
71 |
+
'latent_representation': latent_representation,
|
72 |
+
'attack': attack,
|
73 |
+
'before_release': before_release}
|
74 |
+
|
75 |
+
diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
|
76 |
+
|
77 |
+
midis = midi_dict["midis"]
|
78 |
+
mid = midis[current_midi_name]
|
79 |
+
full_audio = diffSynth.get_music(mid, instrument_names, max_notes=max_notes)
|
80 |
+
|
81 |
+
return {track_audio: (sample_rate, full_audio)}
|
82 |
+
|
83 |
+
with gr.Tab("Arrangement"):
|
84 |
+
default_instrument = "preset_string"
|
85 |
+
current_instruments_state = gr.State(value=[default_instrument for _ in range(100)])
|
86 |
+
current_midi_state = gr.State(value="Ode_to_Joy_Easy_variation")
|
87 |
+
|
88 |
+
gr.Markdown("Make music with generated sounds!")
|
89 |
+
with gr.Row(variant="panel"):
|
90 |
+
with gr.Column(scale=3):
|
91 |
+
|
92 |
+
@gr.render(inputs=midi_files_state)
|
93 |
+
def check_midis(midi_dict):
|
94 |
+
midis = midi_dict["midis"]
|
95 |
+
midi_names = list(midis.keys())
|
96 |
+
|
97 |
+
instrument_dropdown = gr.Dropdown(
|
98 |
+
midi_names, label="Select from preset midi files", value="Ode_to_Joy_Easy_variation"
|
99 |
+
)
|
100 |
+
|
101 |
+
def select_midi(midi_name):
|
102 |
+
# print(f"midi_name: {midi_name}")
|
103 |
+
mid = midis[midi_name]
|
104 |
+
tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
|
105 |
+
midi_info_text = f"Name: {midi_name}"
|
106 |
+
for i, track in enumerate(tracks):
|
107 |
+
midi_info_text += f"\n{len(track.events)} events loaded from Track {i}."
|
108 |
+
|
109 |
+
return {midi_info_textbox: gr.Textbox(label="Midi info", lines=10,
|
110 |
+
placeholder=midi_info_text),
|
111 |
+
current_midi_state: midi_name}
|
112 |
+
|
113 |
+
instrument_dropdown.select(select_midi, inputs=instrument_dropdown,
|
114 |
+
outputs=[midi_info_textbox, current_midi_state])
|
115 |
+
|
116 |
+
midi_file = gr.File(label="Upload a midi file", type="binary", scale=1)
|
117 |
+
midi_info_textbox = gr.Textbox(label="Midi info", lines=10,
|
118 |
+
placeholder="Please select/upload a midi on the left.", scale=3,
|
119 |
+
visible=False)
|
120 |
+
|
121 |
+
with gr.Column(scale=3, ):
|
122 |
+
|
123 |
+
@gr.render(inputs=[current_midi_state, midi_files_state, virtual_instruments_state])
|
124 |
+
def render_select_instruments(current_midi_name, midi_dict, virtual_instruments_dict):
|
125 |
+
|
126 |
+
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
|
127 |
+
instrument_names = list(virtual_instruments.keys())
|
128 |
+
|
129 |
+
midis = midi_dict["midis"]
|
130 |
+
mid = midis[current_midi_name]
|
131 |
+
tracks = [Track(t, mid.ticks_per_beat) for t in mid.tracks]
|
132 |
+
|
133 |
+
dropdowns = []
|
134 |
+
for i, track in enumerate(tracks):
|
135 |
+
dropdowns.append(gr.Dropdown(
|
136 |
+
instrument_names, value=default_instrument, label=f"Track {i}: {len(track.events)} notes",
|
137 |
+
info=f"Select an instrument to play this track!"
|
138 |
+
))
|
139 |
+
|
140 |
+
def select_instruments(*instruments):
|
141 |
+
return instruments
|
142 |
+
|
143 |
+
for d in dropdowns:
|
144 |
+
d.select(select_instruments, inputs=dropdowns,
|
145 |
+
outputs=current_instruments_state)
|
146 |
+
|
147 |
+
|
148 |
+
with gr.Column(scale=3):
|
149 |
+
max_notes_slider = gr.Slider(minimum=10.0, maximum=999.0, value=100.0, step=1.0,
|
150 |
+
label="Maximum number of synthesized notes in each track",
|
151 |
+
info="Lower this value to prevent Gradio timeouts")
|
152 |
+
make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
|
153 |
+
track_audio = gr.Audio(type="numpy", label="Play music", interactive=False)
|
154 |
+
|
155 |
+
with gr.Row(variant="panel", visible=False):
|
156 |
+
with gr.Tab("Origin sound"):
|
157 |
+
inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
|
158 |
+
label="inpaint_steps")
|
159 |
+
noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
|
160 |
+
end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
|
161 |
+
label="end_noise_level_ratio")
|
162 |
+
attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
|
163 |
+
before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
|
164 |
+
label="before_release in sec")
|
165 |
+
release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
|
166 |
+
mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
|
167 |
+
label="mask_flexivity")
|
168 |
+
with gr.Tab("Length adjustment config"):
|
169 |
+
use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
|
170 |
+
test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
|
171 |
+
test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
|
172 |
+
test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
|
173 |
+
duration_slider = gradioWebUI.get_duration_slider()
|
174 |
+
with gr.Tab("Pitch shift config"):
|
175 |
+
pitch_shift_radio = gr.Radio(choices=["librosa", "torchaudio", "rubberband"],
|
176 |
+
value="librosa")
|
177 |
+
|
178 |
+
with gr.Row(variant="panel", visible=False):
|
179 |
+
with gr.Column(scale=2):
|
180 |
+
with gr.Row(variant="panel"):
|
181 |
+
source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
|
182 |
+
height=600, scale=1)
|
183 |
+
source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
|
184 |
+
height=600, scale=1)
|
185 |
+
|
186 |
+
make_track_button.click(make_track,
|
187 |
+
inputs=[inpaint_steps_slider, current_midi_state, midi_files_state,
|
188 |
+
max_notes_slider, noising_strength_slider,
|
189 |
+
attack_slider,
|
190 |
+
before_release_slider,
|
191 |
+
current_instruments_state,
|
192 |
+
virtual_instruments_state],
|
193 |
+
outputs=[track_audio])
|
194 |
+
|
195 |
+
midi_file.change(read_midi,
|
196 |
+
inputs=[midi_file,
|
197 |
+
midi_files_state],
|
198 |
+
outputs=[midi_info_textbox,
|
199 |
+
current_midi_state,
|
200 |
+
midi_files_state])
|
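Note: the `Track` wrapper used above comes from the project's `track_maker.py`, which is not shown in this hunk. The `mid` objects it consumes expose `tracks` and `ticks_per_beat`, which matches the `mido` API; a minimal sketch of the kind of per-track summary the "Midi info" textbox displays, written directly against `mido` (the use of `mido` and the exact event-counting rule are assumptions, not the repository's exact implementation):

```python
import mido  # assumed MIDI backend; the preset .mid files live in webUI/presets/midis

def summarize_midi(path: str) -> None:
    """Print a per-track note count, similar to the 'Midi info' textbox above."""
    mid = mido.MidiFile(path)
    print(f"ticks_per_beat: {mid.ticks_per_beat}")
    for i, track in enumerate(mid.tracks):
        abs_tick, notes = 0, 0
        for msg in track:
            abs_tick += msg.time  # delta ticks -> absolute ticks
            if msg.type == "note_on" and msg.velocity > 0:
                notes += 1
        print(f"Track {i}: {notes} note events, ends at tick {abs_tick}")

# summarize_midi("webUI/presets/midis/Ode_to_Joy_Easy_variation.mid")
```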
webUI/natural_language_guided_4/rec.py
ADDED
@@ -0,0 +1,190 @@
import gradio as gr

from data_generation.nsynth import get_nsynth_dataloader
from webUI.natural_language_guided_STFT.utils import encodeBatch2GradioOutput_STFT, InputBatch2Encode_STFT, \
    latent_representation_to_Gradio_image


def get_recSTFT_module(gradioWebUI, reconstruction_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels

    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_encoder = gradioWebUI.VAE_encoder
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def generate_reconstruction_samples(sample_source, batchsize_slider, encodeCache, reconstruction_samples):

        vae_batchsize = int(batchsize_slider)

        if sample_source == "text2sound_trainSTFT":
            training_dataset_path = 'data/NSynth/nsynth-STFT-train-52.hdf5'  # Make sure to use your actual path
            iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
                                             get_latent_representation=False, with_meta_data=False, task="STFT")
        elif sample_source == "text2sound_validSTFT":
            training_dataset_path = 'data/NSynth/nsynth-STFT-valid-52.hdf5'  # Make sure to use your actual path
            iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
                                             get_latent_representation=False, with_meta_data=False, task="STFT")
        elif sample_source == "text2sound_testSTFT":
            training_dataset_path = 'data/NSynth/nsynth-STFT-test-52.hdf5'  # Make sure to use your actual path
            iterator = get_nsynth_dataloader(training_dataset_path, batch_size=vae_batchsize, shuffle=True,
                                             get_latent_representation=False, with_meta_data=False, task="STFT")
        else:
            raise NotImplementedError()

        spectrogram_batch = next(iter(iterator))

        origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, latent_representations, quantized_latent_representations = InputBatch2Encode_STFT(
            VAE_encoder, spectrogram_batch, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer, squared=squared)

        latent_representation_gradio_images, quantized_latent_representation_gradio_images = [], []
        for i in range(vae_batchsize):
            latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
            quantized_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(quantized_latent_representations[i]))

        if quantized_latent_representations is None:
            quantized_latent_representations = latent_representations
        reconstruction_flipped_log_spectrums, reconstruction_flipped_phases, reconstruction_signals, reconstruction_flipped_log_spectrums_WOA, reconstruction_flipped_phases_WOA, reconstruction_signals_WOA = encodeBatch2GradioOutput_STFT(
            VAE_decoder, quantized_latent_representations,
            resolution=(512, width * VAE_scale),
            original_STFT_batch=spectrogram_batch)

        reconstruction_samples["origin_flipped_log_spectrums"] = origin_flipped_log_spectrums
        reconstruction_samples["origin_flipped_phases"] = origin_flipped_phases
        reconstruction_samples["origin_signals"] = origin_signals
        reconstruction_samples["latent_representation_gradio_images"] = latent_representation_gradio_images
        reconstruction_samples["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
        reconstruction_samples["reconstruction_flipped_log_spectrums"] = reconstruction_flipped_log_spectrums
        reconstruction_samples["reconstruction_flipped_phases"] = reconstruction_flipped_phases
        reconstruction_samples["reconstruction_signals"] = reconstruction_signals
        reconstruction_samples["reconstruction_flipped_log_spectrums_WOA"] = reconstruction_flipped_log_spectrums_WOA
        reconstruction_samples["reconstruction_flipped_phases_WOA"] = reconstruction_flipped_phases_WOA
        reconstruction_samples["reconstruction_signals_WOA"] = reconstruction_signals_WOA
        reconstruction_samples["sampleRate"] = sample_rate

        latent_representation_gradio_image = reconstruction_samples["latent_representation_gradio_images"][0]
        quantized_latent_representation_gradio_image = \
            reconstruction_samples["quantized_latent_representation_gradio_images"][0]
        origin_flipped_log_spectrum = reconstruction_samples["origin_flipped_log_spectrums"][0]
        origin_flipped_phase = reconstruction_samples["origin_flipped_phases"][0]
        origin_signal = reconstruction_samples["origin_signals"][0]
        reconstruction_flipped_log_spectrum = reconstruction_samples["reconstruction_flipped_log_spectrums"][0]
        reconstruction_flipped_phase = reconstruction_samples["reconstruction_flipped_phases"][0]
        reconstruction_signal = reconstruction_samples["reconstruction_signals"][0]
        reconstruction_flipped_log_spectrum_WOA = reconstruction_samples["reconstruction_flipped_log_spectrums_WOA"][0]
        reconstruction_flipped_phase_WOA = reconstruction_samples["reconstruction_flipped_phases_WOA"][0]
        reconstruction_signal_WOA = reconstruction_samples["reconstruction_signals_WOA"][0]

        return {origin_amplitude_image_output: origin_flipped_log_spectrum,
                origin_phase_image_output: origin_flipped_phase,
                origin_audio_output: (sample_rate, origin_signal),
                latent_representation_image_output: latent_representation_gradio_image,
                quantized_latent_representation_image_output: quantized_latent_representation_gradio_image,
                reconstruction_amplitude_image_output: reconstruction_flipped_log_spectrum,
                reconstruction_phase_image_output: reconstruction_flipped_phase,
                reconstruction_audio_output: (sample_rate, reconstruction_signal),
                reconstruction_amplitude_image_output_WOA: reconstruction_flipped_log_spectrum_WOA,
                reconstruction_phase_image_output_WOA: reconstruction_flipped_phase_WOA,
                reconstruction_audio_output_WOA: (sample_rate, reconstruction_signal_WOA),
                sample_index_slider: gr.update(minimum=0, maximum=vae_batchsize - 1, value=0, step=1.0,
                                               label="Sample index.",
                                               info="Slide to view other samples", scale=1, visible=True),
                reconstruction_state: encodeCache,
                reconstruction_samples_state: reconstruction_samples}

    def show_reconstruction_sample(sample_index, encodeCache_state, reconstruction_samples_state):
        sample_index = int(sample_index)
        sampleRate = reconstruction_samples_state["sampleRate"]
        latent_representation_gradio_image = reconstruction_samples_state["latent_representation_gradio_images"][sample_index]
        quantized_latent_representation_gradio_image = \
            reconstruction_samples_state["quantized_latent_representation_gradio_images"][sample_index]
        origin_flipped_log_spectrum = reconstruction_samples_state["origin_flipped_log_spectrums"][sample_index]
        origin_flipped_phase = reconstruction_samples_state["origin_flipped_phases"][sample_index]
        origin_signal = reconstruction_samples_state["origin_signals"][sample_index]
        reconstruction_flipped_log_spectrum = reconstruction_samples_state["reconstruction_flipped_log_spectrums"][sample_index]
        reconstruction_flipped_phase = reconstruction_samples_state["reconstruction_flipped_phases"][sample_index]
        reconstruction_signal = reconstruction_samples_state["reconstruction_signals"][sample_index]
        reconstruction_flipped_log_spectrum_WOA = reconstruction_samples_state["reconstruction_flipped_log_spectrums_WOA"][sample_index]
        reconstruction_flipped_phase_WOA = reconstruction_samples_state["reconstruction_flipped_phases_WOA"][sample_index]
        reconstruction_signal_WOA = reconstruction_samples_state["reconstruction_signals_WOA"][sample_index]
        return origin_flipped_log_spectrum, origin_flipped_phase, (sampleRate, origin_signal), \
            latent_representation_gradio_image, quantized_latent_representation_gradio_image, \
            reconstruction_flipped_log_spectrum, reconstruction_flipped_phase, (sampleRate, reconstruction_signal), \
            reconstruction_flipped_log_spectrum_WOA, reconstruction_flipped_phase_WOA, (sampleRate, reconstruction_signal_WOA), \
            encodeCache_state, reconstruction_samples_state

    with gr.Tab("Reconstruction"):
        reconstruction_samples_state = gr.State(value={})
        gr.Markdown("Test reconstruction.")
        with gr.Row(variant="panel"):
            with gr.Column():
                sample_source_radio = gr.Radio(
                    choices=["synthetic", "external", "text2sound_trainSTFT", "text2sound_testSTFT", "text2sound_validSTFT"],
                    value="text2sound_trainSTFT", info="Info placeholder", scale=2)
                batchsize_slider = gr.Slider(minimum=1., maximum=16., value=4., step=1., label="batchsize")
            with gr.Column():
                generate_button = gr.Button(variant="primary", value="Generate reconstruction samples", scale=1)
                sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, label="Sample index.",
                                                info="Slide to view other samples", scale=1, visible=False)
        with gr.Row(variant="panel"):
            with gr.Column():
                origin_amplitude_image_output = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
                origin_phase_image_output = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
                origin_audio_output = gr.Audio(type="numpy", label="Play the example!")
            with gr.Column():
                reconstruction_amplitude_image_output = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
                reconstruction_phase_image_output = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
                reconstruction_audio_output = gr.Audio(type="numpy", label="Play the example!")
            with gr.Column():
                reconstruction_amplitude_image_output_WOA = gr.Image(label="Spectrogram", type="numpy", height=300, width=100, scale=1)
                reconstruction_phase_image_output_WOA = gr.Image(label="Phase", type="numpy", height=300, width=100, scale=1)
                reconstruction_audio_output_WOA = gr.Audio(type="numpy", label="Play the example!")
        with gr.Row(variant="panel", equal_height=True):
            latent_representation_image_output = gr.Image(label="latent_representation", type="numpy", height=300, width=100)
            quantized_latent_representation_image_output = gr.Image(label="quantized", type="numpy", height=300, width=100)

        generate_button.click(generate_reconstruction_samples,
                              inputs=[sample_source_radio, batchsize_slider, reconstruction_state,
                                      reconstruction_samples_state],
                              outputs=[origin_amplitude_image_output, origin_phase_image_output, origin_audio_output,
                                       latent_representation_image_output, quantized_latent_representation_image_output,
                                       reconstruction_amplitude_image_output, reconstruction_phase_image_output, reconstruction_audio_output,
                                       reconstruction_amplitude_image_output_WOA, reconstruction_phase_image_output_WOA, reconstruction_audio_output_WOA,
                                       sample_index_slider, reconstruction_state, reconstruction_samples_state])

        sample_index_slider.change(show_reconstruction_sample,
                                   inputs=[sample_index_slider, reconstruction_state, reconstruction_samples_state],
                                   outputs=[origin_amplitude_image_output, origin_phase_image_output, origin_audio_output,
                                            latent_representation_image_output, quantized_latent_representation_image_output,
                                            reconstruction_amplitude_image_output, reconstruction_phase_image_output, reconstruction_audio_output,
                                            reconstruction_amplitude_image_output_WOA, reconstruction_phase_image_output_WOA, reconstruction_audio_output_WOA,
                                            reconstruction_state, reconstruction_samples_state])
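The hdf5 batches consumed by `get_recSTFT_module` are pre-computed STFT tensors; the Sound2Sound module in the next file builds the same representation on the fly with `librosa.stft(..., n_fft=1024, hop_length=256, win_length=1024)` followed by the project helpers `pad_STFT` and `encode_stft`. A rough sketch of what such an encoding could look like (log-magnitude plus normalized phase channels; the exact padding and scaling used by `encode_stft` are assumptions):

```python
import numpy as np
import librosa

def encode_stft_sketch(audio: np.ndarray) -> np.ndarray:
    """Toy stand-in for tools.encode_stft: complex STFT -> channel-stacked image-like array."""
    D = librosa.stft(audio, n_fft=1024, hop_length=256, win_length=1024)  # (513, T) complex
    D = D[:512, :]                             # drop the Nyquist bin so the height is 512 (assumed convention)
    log_mag = np.log1p(np.abs(D))              # compress dynamics; the real code may scale differently
    phase = np.angle(D) / np.pi                # map phase to [-1, 1]
    return np.stack([log_mag, phase], axis=0)  # (2, 512, T)
```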
webUI/natural_language_guided_4/sound2sound_with_text.py
ADDED
@@ -0,0 +1,325 @@
import gradio as gr
import librosa
import numpy as np
import torch

from model.DiffSynthSampler import DiffSynthSampler
from tools import pad_STFT, encode_stft
from tools import safe_int, adjust_audio_length
from webUI.natural_language_guided_4.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT, \
    latent_representation_to_Gradio_image, resize_image_to_aspect_ratio, add_instrument


def get_sound2sound_with_text_module(gradioWebUI, sound2sound_with_text_state, virtual_instruments_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
    timesteps = gradioWebUI.timesteps
    VAE_encoder = gradioWebUI.VAE_encoder
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin,
                                    sound2sound_with_text_dict, virtual_instruments_dict):
        origin_sr, origin_audio = sound2sound_origin
        origin_audio = origin_audio / np.max(np.abs(origin_audio))

        width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
        audio_length = 256 * (VAE_scale * width - 1)
        origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)

        D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
        padded_D = pad_STFT(D)
        encoded_D = encode_stft(padded_D)

        # Todo: adjust batchsize to 1
        origin_spectrogram_batch_tensor = torch.from_numpy(
            np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)

        # Todo: remove hard-coding
        origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
            VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer,
            squared=squared)

        sound2sound_with_text_dict["origin_latent_representations"] = origin_latent_representations.tolist()
        sound2sound_with_text_dict["sound2sound_origin_latent_representation_image"] = latent_representation_to_Gradio_image(
            origin_latent_representations[0]).tolist()
        sound2sound_with_text_dict["sound2sound_origin_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
            quantized_origin_latent_representations[0]).tolist()

        return {sound2sound_origin_spectrogram_image: resize_image_to_aspect_ratio(origin_flipped_log_spectrums[0], 1.55, 1),
                sound2sound_origin_phase_image: resize_image_to_aspect_ratio(origin_flipped_phases[0], 1.55, 1),
                sound2sound_origin_latent_representation_image: latent_representation_to_Gradio_image(
                    origin_latent_representations[0]),
                sound2sound_origin_quantized_latent_representation_image: latent_representation_to_Gradio_image(
                    quantized_origin_latent_representations[0]),
                sound2sound_with_text_state: sound2sound_with_text_dict,
                virtual_instruments_state: virtual_instruments_dict}

    def sound2sound_sample(sound2sound_prompts, sound2sound_negative_prompts, sound2sound_batchsize,
                           sound2sound_guidance_scale, sound2sound_sampler,
                           sound2sound_sample_steps,
                           sound2sound_noising_strength, sound2sound_seed, sound2sound_dict, virtual_instruments_dict):
        # input processing
        sound2sound_seed = safe_int(sound2sound_seed, 12345678)
        sound2sound_batchsize = int(sound2sound_batchsize)
        noising_strength = sound2sound_noising_strength
        sound2sound_sample_steps = int(sound2sound_sample_steps)
        CFG = int(sound2sound_guidance_scale)

        origin_latent_representations = torch.tensor(
            sound2sound_dict["origin_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(device)

        # sound2sound
        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([sound2sound_prompts], padding=True, return_tensors="pt"))[0].to(device)

        mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
        negative_condition = \
            CLAP.get_text_features(**CLAP_tokenizer([sound2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
        mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))

        normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
        mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))

        condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)

        # Todo: remove hard-coding
        width = origin_latent_representations.shape[-1]
        new_sound_latent_representations, initial_noise = \
            mySampler.img_guided_sample(model=uNet, shape=(sound2sound_batchsize, channels, height, width),
                                        seed=sound2sound_seed,
                                        noising_strength=noising_strength,
                                        guide_img=origin_latent_representations, return_tensor=True,
                                        condition=condition,
                                        sampler=sound2sound_sampler)

        new_sound_latent_representations = new_sound_latent_representations[-1]

        # Quantize new sound latent representations
        quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)

        new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
            VAE_decoder, quantized_new_sound_latent_representations,
            resolution=(512, width * VAE_scale),
            original_STFT_batch=None)

        new_sound_latent_representation_gradio_images = []
        new_sound_quantized_latent_representation_gradio_images = []
        new_sound_spectrogram_gradio_images = []
        new_sound_phase_gradio_images = []
        new_sound_rec_signals_gradio = []
        for i in range(sound2sound_batchsize):
            new_sound_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
            new_sound_quantized_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
            new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
            new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
            new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
        sound2sound_dict["new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
        sound2sound_dict["new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
        sound2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
        sound2sound_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
        sound2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio

        # save instrument
        sound2sound_dict["latent_representations"] = new_sound_latent_representations.to("cpu").detach().numpy()
        sound2sound_dict["quantized_latent_representations"] = quantized_new_sound_latent_representations.to("cpu").detach().numpy()
        sound2sound_dict["condition"] = condition.to("cpu").detach().numpy()
        sound2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
        sound2sound_dict["guidance_scale"] = CFG
        sound2sound_dict["sampler"] = sound2sound_sampler

        return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
                    new_sound_latent_representations[0]),
                sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
                    quantized_new_sound_latent_representations[0]),
                sound2sound_new_sound_spectrogram_image: resize_image_to_aspect_ratio(new_sound_flipped_log_spectrums[0], 1.55, 1),
                sound2sound_new_sound_phase_image: resize_image_to_aspect_ratio(new_sound_flipped_phases[0], 1.55, 1),
                sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
                sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
                                                           step=1.0, visible=True,
                                                           label="Sample index",
                                                           info="Swipe to view other samples"),
                sound2sound_seed_textbox: sound2sound_seed,
                sound2sound_with_text_state: sound2sound_dict,
                virtual_instruments_state: virtual_instruments_dict}

    def show_sound2sound_sample(sound2sound_sample_index, sound2sound_with_text_dict):
        sample_index = int(sound2sound_sample_index)
        return {sound2sound_new_sound_latent_representation_image:
                    sound2sound_with_text_dict["new_sound_latent_representation_gradio_images"][sample_index],
                sound2sound_new_sound_quantized_latent_representation_image:
                    sound2sound_with_text_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
                sound2sound_new_sound_spectrogram_image: resize_image_to_aspect_ratio(
                    sound2sound_with_text_dict["new_sound_spectrogram_gradio_images"][sample_index], 1.55, 1),
                sound2sound_new_sound_phase_image: resize_image_to_aspect_ratio(
                    sound2sound_with_text_dict["new_sound_phase_gradio_images"][sample_index], 1.55, 1),
                sound2sound_new_sound_audio: sound2sound_with_text_dict["new_sound_rec_signals_gradio"][sample_index]}

    def save_virtual_instrument(sample_index, virtual_instrument_name, sound2sound_dict, virtual_instruments_dict):
        virtual_instruments_dict = add_instrument(sound2sound_dict, virtual_instruments_dict, virtual_instrument_name,
                                                  sample_index)

        return {virtual_instruments_state: virtual_instruments_dict,
                text2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
                                                               placeholder=f"Saved as {virtual_instrument_name}!")}

    with gr.Tab("Sound2Sound"):
        gr.Markdown("Generate new sound based on a given sound!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                sound2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")

            with gr.Column(scale=1):
                sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)

                sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
                                                            label="Sample index",
                                                            info="Swipe to view other samples")

        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                with gr.Tab("Origin sound"):
                    sound2sound_duration_slider = gradioWebUI.get_duration_slider()

                    sound2sound_origin_audio = gr.Audio(
                        sources=["microphone", "upload"], label="Upload/Record source sound",
                        waveform_options=gr.WaveformOptions(
                            waveform_color="#01C6FF",
                            waveform_progress_color="#0066B4",
                            skip_length=1,
                            show_controls=False,
                        ),
                    )

                    with gr.Row(variant="panel"):
                        sound2sound_origin_spectrogram_image = gr.Image(label="Original upload spectrogram",
                                                                        type="numpy", visible=True)
                        sound2sound_origin_phase_image = gr.Image(label="Original upload phase",
                                                                  type="numpy", visible=True)

                with gr.Tab("Sound2sound settings"):
                    sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
                    sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
                    sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
                    sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider()
                    sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
                    sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()

            with gr.Column(scale=1):
                sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False,
                                                       waveform_options=gr.WaveformOptions(
                                                           waveform_color="#FFB6C1",
                                                           waveform_progress_color="#FF0000",
                                                           skip_length=1,
                                                           show_controls=False,
                                                       ), )
                with gr.Row(variant="panel"):
                    sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy", scale=1)
                    sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy", scale=1)

                with gr.Row(variant="panel"):
                    text2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=2,
                                                                    placeholder="Name of your instrument", scale=1)
                    text2sound_save_instrument_button = gr.Button(variant="primary", value="Save instrument", scale=1)

        with gr.Row(variant="panel"):
            sound2sound_origin_latent_representation_image = gr.Image(label="Original latent representation",
                                                                      type="numpy", height=800, visible=False)
            sound2sound_origin_quantized_latent_representation_image = gr.Image(
                label="Original quantized latent representation", type="numpy", height=800, visible=False)

            sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
                                                                         type="numpy", height=800, visible=False)
            sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
                label="New sound quantized latent representation", type="numpy", height=800, visible=False)

        sound2sound_origin_audio.change(receive_upload_origin_audio,
                                        inputs=[sound2sound_duration_slider,
                                                sound2sound_origin_audio,
                                                sound2sound_with_text_state,
                                                virtual_instruments_state],
                                        outputs=[sound2sound_origin_spectrogram_image,
                                                 sound2sound_origin_phase_image,
                                                 sound2sound_origin_latent_representation_image,
                                                 sound2sound_origin_quantized_latent_representation_image,
                                                 sound2sound_with_text_state,
                                                 virtual_instruments_state])

        sound2sound_sample_button.click(sound2sound_sample,
                                        inputs=[sound2sound_prompts_textbox,
                                                text2sound_negative_prompts_textbox,
                                                sound2sound_batchsize_slider,
                                                sound2sound_guidance_scale_slider,
                                                sound2sound_sampler_radio,
                                                sound2sound_sample_steps_slider,
                                                sound2sound_noising_strength_slider,
                                                sound2sound_seed_textbox,
                                                sound2sound_with_text_state,
                                                virtual_instruments_state],
                                        outputs=[sound2sound_new_sound_latent_representation_image,
                                                 sound2sound_new_sound_quantized_latent_representation_image,
                                                 sound2sound_new_sound_spectrogram_image,
                                                 sound2sound_new_sound_phase_image,
                                                 sound2sound_new_sound_audio,
                                                 sound2sound_sample_index_slider,
                                                 sound2sound_seed_textbox,
                                                 sound2sound_with_text_state,
                                                 virtual_instruments_state])

        text2sound_save_instrument_button.click(save_virtual_instrument,
                                                inputs=[sound2sound_sample_index_slider,
                                                        text2sound_instrument_name_textbox,
                                                        sound2sound_with_text_state,
                                                        virtual_instruments_state],
                                                outputs=[virtual_instruments_state,
                                                         text2sound_instrument_name_textbox])

        sound2sound_sample_index_slider.change(show_sound2sound_sample,
                                               inputs=[sound2sound_sample_index_slider, sound2sound_with_text_state],
                                               outputs=[sound2sound_new_sound_latent_representation_image,
                                                        sound2sound_new_sound_quantized_latent_representation_image,
                                                        sound2sound_new_sound_spectrogram_image,
                                                        sound2sound_new_sound_phase_image,
                                                        sound2sound_new_sound_audio])
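The upload handler above fixes the relationship between the requested duration, the latent width, and the number of audio samples, and the sampler loop stretches the step count by the noising strength so that only the last `sound2sound_sample_steps` of the respaced schedule are actually run. A worked example of that arithmetic, taking `time_resolution = 256` from `app.py` and assuming `VAE_scale = 4` and a 3-second duration setting (inferred rather than confirmed values):

```python
time_resolution = 256   # from app.py
VAE_scale = 4           # assumed: 512 frequency bins encode to a latent height of 128
duration = 3            # example slider value, in seconds

width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)   # -> 64 latent frames
audio_length = 256 * (VAE_scale * width - 1)                      # -> 65280 samples
# 65280 samples at hop_length=256 give exactly 256 STFT frames, i.e. VAE_scale * width.

steps, noising_strength = 20, 0.5
normalized_sample_steps = int(steps / noising_strength)           # -> 40-step schedule, of which
                                                                  #    roughly the noised half is sampled
print(width, audio_length, normalized_sample_steps)
```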
webUI/natural_language_guided_4/super_resolution_with_text.py
ADDED
@@ -0,0 +1,387 @@
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import gradio as gr
|
5 |
+
from scipy.ndimage import zoom
|
6 |
+
|
7 |
+
from model.DiffSynthSampler import DiffSynthSampler
|
8 |
+
from tools import adjust_audio_length, rescale, safe_int, pad_STFT, encode_stft
|
9 |
+
from webUI.natural_language_guided_STFT.utils import latent_representation_to_Gradio_image
|
10 |
+
from webUI.natural_language_guided_STFT.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT
|
11 |
+
|
12 |
+
|
13 |
+
def get_super_resolution_with_text_module(gradioWebUI, inpaintWithText_state):
|
14 |
+
# Load configurations
|
15 |
+
uNet = gradioWebUI.uNet
|
16 |
+
freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
|
17 |
+
VAE_scale = gradioWebUI.VAE_scale
|
18 |
+
height, width, channels = int(freq_resolution/VAE_scale), int(time_resolution/VAE_scale), gradioWebUI.channels
|
19 |
+
timesteps = gradioWebUI.timesteps
|
20 |
+
VAE_encoder = gradioWebUI.VAE_encoder
|
21 |
+
VAE_quantizer = gradioWebUI.VAE_quantizer
|
22 |
+
VAE_decoder = gradioWebUI.VAE_decoder
|
23 |
+
CLAP = gradioWebUI.CLAP
|
24 |
+
CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
|
25 |
+
device = gradioWebUI.device
|
26 |
+
squared = gradioWebUI.squared
|
27 |
+
sample_rate = gradioWebUI.sample_rate
|
28 |
+
noise_strategy = gradioWebUI.noise_strategy
|
29 |
+
|
30 |
+
def receive_uopoad_origin_audio(sound2sound_duration, sound2sound_origin_source, sound2sound_origin_upload, sound2sound_origin_microphone,
|
31 |
+
inpaintWithText_dict):
|
32 |
+
|
33 |
+
if sound2sound_origin_source == "upload":
|
34 |
+
origin_sr, origin_audio = sound2sound_origin_upload
|
35 |
+
else:
|
36 |
+
origin_sr, origin_audio = sound2sound_origin_microphone
|
37 |
+
|
38 |
+
origin_audio = origin_audio / np.max(np.abs(origin_audio))
|
39 |
+
|
40 |
+
width = int(time_resolution*((sound2sound_duration+1)/4) / VAE_scale)
|
41 |
+
audio_length = 256 * (VAE_scale * width - 1)
|
42 |
+
origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)
|
43 |
+
|
44 |
+
D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
|
45 |
+
padded_D = pad_STFT(D)
|
46 |
+
encoded_D = encode_stft(padded_D)
|
47 |
+
|
48 |
+
# Todo: justify batchsize to 1
|
49 |
+
origin_spectrogram_batch_tensor = torch.from_numpy(
|
50 |
+
np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)
|
51 |
+
|
52 |
+
# Todo: remove hard-coding
|
53 |
+
origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, origin_latent_representations, quantized_origin_latent_representations = InputBatch2Encode_STFT(
|
54 |
+
VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, width * VAE_scale), quantizer=VAE_quantizer, squared=squared)
|
55 |
+
|
56 |
+
if sound2sound_origin_source == "upload":
|
57 |
+
inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
|
58 |
+
inpaintWithText_dict[
|
59 |
+
"sound2sound_origin_upload_latent_representation_image"] = latent_representation_to_Gradio_image(
|
60 |
+
origin_latent_representations[0]).tolist()
|
61 |
+
inpaintWithText_dict[
|
62 |
+
"sound2sound_origin_upload_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
|
63 |
+
quantized_origin_latent_representations[0]).tolist()
|
64 |
+
return {sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
|
65 |
+
sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
|
66 |
+
sound2sound_origin_spectrogram_microphone_image: gr.update(),
|
67 |
+
sound2sound_origin_phase_microphone_image: gr.update(),
|
68 |
+
sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
|
69 |
+
origin_latent_representations[0]),
|
70 |
+
sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
|
71 |
+
quantized_origin_latent_representations[0]),
|
72 |
+
sound2sound_origin_microphone_latent_representation_image: gr.update(),
|
73 |
+
sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
|
74 |
+
inpaintWithText_state: inpaintWithText_dict}
|
75 |
+
else:
|
76 |
+
inpaintWithText_dict["origin_microphone_latent_representations"] = origin_latent_representations.tolist()
|
77 |
+
inpaintWithText_dict[
|
78 |
+
"sound2sound_origin_microphone_latent_representation_image"] = latent_representation_to_Gradio_image(
|
79 |
+
origin_latent_representations[0]).tolist()
|
80 |
+
inpaintWithText_dict[
|
81 |
+
"sound2sound_origin_microphone_quantized_latent_representation_image"] = latent_representation_to_Gradio_image(
|
82 |
+
quantized_origin_latent_representations[0]).tolist()
|
83 |
+
return {sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
|
84 |
+
sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
|
85 |
+
sound2sound_origin_spectrogram_microphone_image: gr.update(),
|
86 |
+
sound2sound_origin_phase_microphone_image: gr.update(),
|
87 |
+
sound2sound_origin_upload_latent_representation_image: latent_representation_to_Gradio_image(
|
88 |
+
origin_latent_representations[0]),
|
89 |
+
sound2sound_origin_upload_quantized_latent_representation_image: latent_representation_to_Gradio_image(
|
90 |
+
quantized_origin_latent_representations[0]),
|
91 |
+
sound2sound_origin_microphone_latent_representation_image: gr.update(),
|
92 |
+
sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
|
93 |
+
inpaintWithText_state: inpaintWithText_dict}
|
94 |
+
|
95 |
+
def sound2sound_sample(sound2sound_origin_spectrogram_upload, sound2sound_origin_spectrogram_microphone,
|
96 |
+
text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
|
97 |
+
sound2sound_guidance_scale, sound2sound_sampler,
|
98 |
+
sound2sound_sample_steps, sound2sound_origin_source,
|
99 |
+
sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area, inpaintWithText_dict
|
100 |
+
):
|
101 |
+
|
102 |
+
# input preprocessing
|
103 |
+
sound2sound_seed = safe_int(sound2sound_seed, 12345678)
|
104 |
+
sound2sound_batchsize = int(sound2sound_batchsize)
|
105 |
+
noising_strength = sound2sound_noising_strength
|
106 |
+
sound2sound_sample_steps = int(sound2sound_sample_steps)
|
107 |
+
CFG = int(sound2sound_guidance_scale)
|
108 |
+
|
109 |
+
text2sound_embedding = \
|
110 |
+
CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(device)
|
111 |
+
|
112 |
+
if sound2sound_origin_source == "upload":
|
113 |
+
origin_latent_representations = torch.tensor(
|
114 |
+
inpaintWithText_dict["origin_upload_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
|
115 |
+
device)
|
116 |
+
elif sound2sound_origin_source == "microphone":
|
117 |
+
origin_latent_representations = torch.tensor(
|
118 |
+
inpaintWithText_dict["origin_microphone_latent_representations"]).repeat(sound2sound_batchsize, 1, 1, 1).to(
|
119 |
+
device)
|
120 |
+
else:
|
121 |
+
print("Input source not in ['upload', 'microphone']!")
|
122 |
+
raise NotImplementedError()
|
123 |
+
|
124 |
+
high_resolution_latent_representations = torch.zeros((sound2sound_batchsize, channels, 256, 64)).to(device)
|
125 |
+
high_resolution_latent_representations[:, :, :128, :] = origin_latent_representations
|
126 |
+
latent_mask = np.ones((256, 64))
|
127 |
+
latent_mask[192:, :] = 0.0
|
128 |
+
print(f"latent_mask mean: {np.mean(latent_mask)}")
|
129 |
+
|
130 |
+
if sound2sound_inpaint_area == "inpaint masked":
|
131 |
+
latent_mask = 1 - latent_mask
|
132 |
+
latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(sound2sound_batchsize, channels, 1,
|
133 |
+
1).float().to(device)
|
134 |
+
latent_mask = torch.flip(latent_mask, [2])
|
135 |
+
|
136 |
+
mySampler = DiffSynthSampler(timesteps, height=height*2, channels=channels, noise_strategy=noise_strategy)
|
137 |
+
unconditional_condition = \
|
138 |
+
CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
|
139 |
+
mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))
|
140 |
+
|
141 |
+
normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
|
142 |
+
|
143 |
+
mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))
|
144 |
+
|
145 |
+
# Todo: remove hard-coding
|
146 |
+
width = high_resolution_latent_representations.shape[-1]
|
147 |
+
condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)
|
148 |
+
|
149 |
+
new_sound_latent_representations, initial_noise = \
|
150 |
+
mySampler.inpaint_sample(model=uNet, shape=(sound2sound_batchsize, channels, height*2, width),
|
151 |
+
seed=sound2sound_seed,
|
152 |
+
noising_strength=noising_strength,
|
153 |
+
guide_img=high_resolution_latent_representations, mask=latent_mask, return_tensor=True,
|
154 |
+
condition=condition, sampler=sound2sound_sampler)
|
155 |
+
|
156 |
+
new_sound_latent_representations = new_sound_latent_representations[-1]
|
157 |
+
|
158 |
+
# Quantize new sound latent representations
|
159 |
+
quantized_new_sound_latent_representations, loss, (_, _, _) = VAE_quantizer(new_sound_latent_representations)
|
160 |
+
new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
|
161 |
+
quantized_new_sound_latent_representations,
|
162 |
+
resolution=(
|
163 |
+
1024,
|
164 |
+
width * VAE_scale),
|
165 |
+
original_STFT_batch=None
|
166 |
+
)
|
167 |
+
|
168 |
+
new_sound_latent_representation_gradio_images = []
|
169 |
+
new_sound_quantized_latent_representation_gradio_images = []
|
170 |
+
new_sound_spectrogram_gradio_images = []
|
171 |
+
new_sound_phase_gradio_images = []
|
172 |
+
new_sound_rec_signals_gradio = []
|
173 |
+
for i in range(sound2sound_batchsize):
|
174 |
+
new_sound_latent_representation_gradio_images.append(
|
175 |
+
latent_representation_to_Gradio_image(new_sound_latent_representations[i]))
|
176 |
+
new_sound_quantized_latent_representation_gradio_images.append(
|
177 |
+
latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i]))
|
178 |
+
new_sound_spectrogram_gradio_images.append(new_sound_flipped_log_spectrums[i])
|
179 |
+
new_sound_phase_gradio_images.append(new_sound_flipped_phases[i])
|
180 |
+
new_sound_rec_signals_gradio.append((sample_rate, new_sound_signals[i]))
|
181 |
+
|
182 |
+
inpaintWithText_dict[
|
183 |
+
"new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
|
184 |
+
inpaintWithText_dict[
|
185 |
+
"new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
|
186 |
+
inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
|
187 |
+
inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
|
188 |
+
inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio
|
189 |
+
|
190 |
+
return {sound2sound_new_sound_latent_representation_image: latent_representation_to_Gradio_image(
|
191 |
+
new_sound_latent_representations[0]),
|
192 |
+
sound2sound_new_sound_quantized_latent_representation_image: latent_representation_to_Gradio_image(
|
193 |
+
quantized_new_sound_latent_representations[0]),
|
194 |
+
sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
|
195 |
+
sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
|
196 |
+
sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
|
197 |
+
sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
|
198 |
+
step=1.0,
|
199 |
+
visible=True,
|
200 |
+
label="Sample index",
|
201 |
+
info="Swipe to view other samples"),
|
202 |
+
sound2sound_seed_textbox: sound2sound_seed,
|
203 |
+
inpaintWithText_state: inpaintWithText_dict}
|
204 |
+
|
205 |
+
def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
|
206 |
+
sample_index = int(sound2sound_sample_index)
|
207 |
+
return {sound2sound_new_sound_latent_representation_image:
|
208 |
+
inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
|
209 |
+
sound2sound_new_sound_quantized_latent_representation_image:
|
210 |
+
inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
|
211 |
+
sound2sound_new_sound_spectrogram_image: inpaintWithText_dict["new_sound_spectrogram_gradio_images"][
|
212 |
+
sample_index],
|
213 |
+
sound2sound_new_sound_phase_image: inpaintWithText_dict["new_sound_phase_gradio_images"][
|
214 |
+
sample_index],
|
215 |
+
sound2sound_new_sound_audio: inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index]}
|
216 |
+
|
217 |
+
def sound2sound_switch_origin_source(sound2sound_origin_source):
|
218 |
+
|
219 |
+
if sound2sound_origin_source == "upload":
|
220 |
+
return {sound2sound_origin_upload_audio: gr.update(visible=True),
|
221 |
+
sound2sound_origin_microphone_audio: gr.update(visible=False),
|
222 |
+
sound2sound_origin_spectrogram_upload_image: gr.update(visible=True),
|
223 |
+
sound2sound_origin_phase_upload_image: gr.update(visible=True),
|
224 |
+
sound2sound_origin_spectrogram_microphone_image: gr.update(visible=False),
|
225 |
+
sound2sound_origin_phase_microphone_image: gr.update(visible=False),
|
226 |
+
sound2sound_origin_upload_latent_representation_image: gr.update(visible=True),
|
227 |
+
sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=True),
|
228 |
+
sound2sound_origin_microphone_latent_representation_image: gr.update(visible=False),
|
229 |
+
sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=False)}
|
230 |
+
elif sound2sound_origin_source == "microphone":
|
231 |
+
return {sound2sound_origin_upload_audio: gr.update(visible=False),
|
232 |
+
sound2sound_origin_microphone_audio: gr.update(visible=True),
|
233 |
+
sound2sound_origin_spectrogram_upload_image: gr.update(visible=False),
|
234 |
+
sound2sound_origin_phase_upload_image: gr.update(visible=False),
|
235 |
+
sound2sound_origin_spectrogram_microphone_image: gr.update(visible=True),
|
236 |
+
sound2sound_origin_phase_microphone_image: gr.update(visible=True),
|
237 |
+
sound2sound_origin_upload_latent_representation_image: gr.update(visible=False),
|
238 |
+
sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=False),
|
239 |
+
sound2sound_origin_microphone_latent_representation_image: gr.update(visible=True),
|
240 |
+
sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=True)}
|
241 |
+
else:
|
242 |
+
print("Input source not in ['upload', 'microphone']!")
|
243 |
+
|
244 |
+
with gr.Tab("Super Resolution"):
|
245 |
+
gr.Markdown("Select the area to inpaint and use the prompt to guide the synthesis of a new sound!")
|
246 |
+
with gr.Row(variant="panel"):
|
247 |
+
with gr.Column(scale=3):
|
248 |
+
text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
|
249 |
+
text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")
|
250 |
+
|
251 |
+
with gr.Column(scale=1):
|
252 |
+
sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)
|
253 |
+
|
254 |
+
sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
|
255 |
+
label="Sample index",
|
256 |
+
info="Swipe to view other samples")
|
257 |
+
|
258 |
+
with gr.Row(variant="panel"):
|
259 |
+
with gr.Column(scale=1):
|
260 |
+
with gr.Tab("Origin sound"):
|
261 |
+
sound2sound_duration_slider = gradioWebUI.get_duration_slider()
|
262 |
+
sound2sound_origin_source_radio = gr.Radio(choices=["upload", "microphone"], value="upload",
|
263 |
+
label="Input source")
|
264 |
+
|
265 |
+
sound2sound_origin_upload_audio = gr.Audio(type="numpy", label="Upload", source="upload",
|
266 |
+
interactive=True, visible=True)
|
267 |
+
sound2sound_origin_microphone_audio = gr.Audio(type="numpy", label="Record", source="microphone",
|
268 |
+
interactive=True, visible=False)
|
269 |
+
with gr.Row(variant="panel"):
|
270 |
+
sound2sound_origin_spectrogram_upload_image = gr.Image(label="Original upload spectrogram",
|
271 |
+
type="numpy", height=600,
|
272 |
+
visible=True, tool="sketch")
|
273 |
+
sound2sound_origin_phase_upload_image = gr.Image(label="Original upload phase",
|
274 |
+
type="numpy", height=600,
|
275 |
+
visible=True)
|
276 |
+
sound2sound_origin_spectrogram_microphone_image = gr.Image(label="Original microphone spectrogram",
|
277 |
+
type="numpy", height=600,
|
278 |
+
visible=False, tool="sketch")
|
279 |
+
sound2sound_origin_phase_microphone_image = gr.Image(label="Original microphone phase",
|
280 |
+
type="numpy", height=600,
|
281 |
+
visible=False)
|
282 |
+
                sound2sound_inpaint_area_radio = gr.Radio(choices=["inpaint masked", "inpaint not masked"],
                                                          value="inpaint masked")

                with gr.Tab("Sound2sound settings"):
                    sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
                    sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
                    sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
                    sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.0)
                    sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
                    sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()

            with gr.Column(scale=1):
                sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
                with gr.Row(variant="panel"):
                    sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                                       height=1200, scale=1)
                    sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                                 height=1200, scale=1)

        with gr.Row(variant="panel"):
            sound2sound_origin_upload_latent_representation_image = gr.Image(label="Original latent representation",
                                                                             type="numpy", height=1200,
                                                                             visible=True)
            sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
                label="Original quantized latent representation", type="numpy", height=1200, visible=True)

            sound2sound_origin_microphone_latent_representation_image = gr.Image(label="Original latent representation",
                                                                                 type="numpy", height=1200,
                                                                                 visible=False)
            sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
                label="Original quantized latent representation", type="numpy", height=1200, visible=False)

            sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
                                                                         type="numpy", height=1200)
            sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
                label="New sound quantized latent representation", type="numpy", height=1200)

        sound2sound_origin_upload_audio.change(receive_uopoad_origin_audio,
                                               inputs=[sound2sound_duration_slider, sound2sound_origin_source_radio,
                                                       sound2sound_origin_upload_audio,
                                                       sound2sound_origin_microphone_audio, inpaintWithText_state],
                                               outputs=[sound2sound_origin_spectrogram_upload_image,
                                                        sound2sound_origin_phase_upload_image,
                                                        sound2sound_origin_spectrogram_microphone_image,
                                                        sound2sound_origin_phase_microphone_image,
                                                        sound2sound_origin_upload_latent_representation_image,
                                                        sound2sound_origin_upload_quantized_latent_representation_image,
                                                        sound2sound_origin_microphone_latent_representation_image,
                                                        sound2sound_origin_microphone_quantized_latent_representation_image,
                                                        inpaintWithText_state])
        sound2sound_origin_microphone_audio.change(receive_uopoad_origin_audio,
                                                   inputs=[sound2sound_duration_slider, sound2sound_origin_source_radio,
                                                           sound2sound_origin_upload_audio,
                                                           sound2sound_origin_microphone_audio, inpaintWithText_state],
                                                   outputs=[sound2sound_origin_spectrogram_upload_image,
                                                            sound2sound_origin_phase_upload_image,
                                                            sound2sound_origin_spectrogram_microphone_image,
                                                            sound2sound_origin_phase_microphone_image,
                                                            sound2sound_origin_upload_latent_representation_image,
                                                            sound2sound_origin_upload_quantized_latent_representation_image,
                                                            sound2sound_origin_microphone_latent_representation_image,
                                                            sound2sound_origin_microphone_quantized_latent_representation_image,
                                                            inpaintWithText_state])

        sound2sound_sample_button.click(sound2sound_sample,
                                        inputs=[sound2sound_origin_spectrogram_upload_image,
                                                sound2sound_origin_spectrogram_microphone_image,
                                                text2sound_prompts_textbox,
                                                text2sound_negative_prompts_textbox,
                                                sound2sound_batchsize_slider,
                                                sound2sound_guidance_scale_slider,
                                                sound2sound_sampler_radio,
                                                sound2sound_sample_steps_slider,
                                                sound2sound_origin_source_radio,
                                                sound2sound_noising_strength_slider,
                                                sound2sound_seed_textbox,
                                                sound2sound_inpaint_area_radio,
                                                inpaintWithText_state],
                                        outputs=[sound2sound_new_sound_latent_representation_image,
                                                 sound2sound_new_sound_quantized_latent_representation_image,
                                                 sound2sound_new_sound_spectrogram_image,
                                                 sound2sound_new_sound_phase_image,
                                                 sound2sound_new_sound_audio,
                                                 sound2sound_sample_index_slider,
                                                 sound2sound_seed_textbox,
                                                 inpaintWithText_state])

        sound2sound_sample_index_slider.change(show_sound2sound_sample,
                                               inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
                                               outputs=[sound2sound_new_sound_latent_representation_image,
                                                        sound2sound_new_sound_quantized_latent_representation_image,
                                                        sound2sound_new_sound_spectrogram_image,
                                                        sound2sound_new_sound_phase_image,
                                                        sound2sound_new_sound_audio])

        sound2sound_origin_source_radio.change(sound2sound_switch_origin_source,
                                               inputs=[sound2sound_origin_source_radio],
                                               outputs=[sound2sound_origin_upload_audio,
                                                        sound2sound_origin_microphone_audio,
                                                        sound2sound_origin_spectrogram_upload_image,
                                                        sound2sound_origin_phase_upload_image,
                                                        sound2sound_origin_spectrogram_microphone_image,
                                                        sound2sound_origin_phase_microphone_image,
                                                        sound2sound_origin_upload_latent_representation_image,
                                                        sound2sound_origin_upload_quantized_latent_representation_image,
                                                        sound2sound_origin_microphone_latent_representation_image,
                                                        sound2sound_origin_microphone_quantized_latent_representation_image])
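The event wiring above follows the Gradio pattern used throughout these modules: a callback receives the listed components' values positionally and returns a dict keyed by output components, usually built with gr.update. A minimal, self-contained sketch of that pattern (the component names here are illustrative, not the repository's):

import gradio as gr

def toggle_source(source):
    # Callbacks may return a dict keyed by output components; gr.update changes
    # properties (e.g. visibility) without replacing the component's value.
    return {upload_audio: gr.update(visible=(source == "Upload")),
            microphone_audio: gr.update(visible=(source == "Microphone"))}

with gr.Blocks() as demo:
    source_radio = gr.Radio(choices=["Upload", "Microphone"], value="Upload")
    upload_audio = gr.Audio(type="numpy", visible=True)
    microphone_audio = gr.Audio(type="numpy", visible=False)
    source_radio.change(toggle_source, inputs=[source_radio],
                        outputs=[upload_audio, microphone_audio])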
webUI/natural_language_guided_4/text2sound.py
ADDED
@@ -0,0 +1,220 @@
import gradio as gr
import numpy as np

from model.DiffSynthSampler import DiffSynthSampler
from tools import safe_int
from webUI.natural_language_guided_4.utils import latent_representation_to_Gradio_image, \
    encodeBatch2GradioOutput_STFT, add_instrument, resize_image_to_aspect_ratio


def get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels

    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def diffusion_random_sample(text2sound_prompts, text2sound_negative_prompts, text2sound_batchsize,
                                text2sound_duration,
                                text2sound_guidance_scale, text2sound_sampler,
                                text2sound_sample_steps, text2sound_seed,
                                text2sound_dict):
        text2sound_sample_steps = int(text2sound_sample_steps)
        text2sound_seed = safe_int(text2sound_seed, 12345678)

        width = int(time_resolution * ((text2sound_duration + 1) / 4) / VAE_scale)

        text2sound_batchsize = int(text2sound_batchsize)

        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(
                device)

        CFG = int(text2sound_guidance_scale)

        mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
        negative_condition = \
            CLAP.get_text_features(**CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[
                0]

        mySampler.activate_classifier_free_guidance(CFG, negative_condition.to(device))

        mySampler.respace(list(np.linspace(0, timesteps - 1, text2sound_sample_steps, dtype=np.int32)))

        condition = text2sound_embedding.repeat(text2sound_batchsize, 1)

        latent_representations, initial_noise = \
            mySampler.sample(model=uNet, shape=(text2sound_batchsize, channels, height, width), seed=text2sound_seed,
                             return_tensor=True, condition=condition, sampler=text2sound_sampler)

        latent_representations = latent_representations[-1]

        latent_representation_gradio_images = []
        quantized_latent_representation_gradio_images = []
        new_sound_spectrogram_gradio_images = []
        new_sound_phase_gradio_images = []
        new_sound_rec_signals_gradio = []

        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
        # Todo: remove hard-coding
        flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(VAE_decoder,
                                                                                                    quantized_latent_representations,
                                                                                                    resolution=(
                                                                                                        512,
                                                                                                        width * VAE_scale),
                                                                                                    original_STFT_batch=None
                                                                                                    )

        for i in range(text2sound_batchsize):
            latent_representation_gradio_images.append(latent_representation_to_Gradio_image(latent_representations[i]))
            quantized_latent_representation_gradio_images.append(
                latent_representation_to_Gradio_image(quantized_latent_representations[i]))
            new_sound_spectrogram_gradio_images.append(flipped_log_spectrums[i])
            new_sound_phase_gradio_images.append(flipped_phases[i])
            new_sound_rec_signals_gradio.append((sample_rate, rec_signals[i]))

        text2sound_dict["latent_representation_gradio_images"] = latent_representation_gradio_images
        text2sound_dict["quantized_latent_representation_gradio_images"] = quantized_latent_representation_gradio_images
        text2sound_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
        text2sound_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
        text2sound_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio

        # save instrument
        text2sound_dict["latent_representations"] = latent_representations.to("cpu").detach().numpy()
        text2sound_dict["quantized_latent_representations"] = quantized_latent_representations.to(
            "cpu").detach().numpy()
        text2sound_dict["condition"] = condition.to("cpu").detach().numpy()
        text2sound_dict["negative_condition"] = negative_condition.to("cpu").detach().numpy()
        text2sound_dict["guidance_scale"] = CFG
        text2sound_dict["sampler"] = text2sound_sampler

        return {text2sound_latent_representation_image: text2sound_dict["latent_representation_gradio_images"][0],
                text2sound_quantized_latent_representation_image:
                    text2sound_dict["quantized_latent_representation_gradio_images"][0],
                text2sound_sampled_spectrogram_image: resize_image_to_aspect_ratio(
                    text2sound_dict["new_sound_spectrogram_gradio_images"][0], 1.55, 1),
                text2sound_sampled_phase_image: resize_image_to_aspect_ratio(
                    text2sound_dict["new_sound_phase_gradio_images"][0], 1.55, 1),
                text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][0],
                text2sound_seed_textbox: text2sound_seed,
                text2sound_state: text2sound_dict,
                text2sound_sample_index_slider: gr.update(minimum=0, maximum=text2sound_batchsize - 1, value=0, step=1,
                                                          visible=True,
                                                          label="Sample index.",
                                                          info="Swipe to view other samples")}

    def show_random_sample(sample_index, text2sound_dict):
        sample_index = int(sample_index)
        text2sound_dict["sample_index"] = sample_index
        print(text2sound_dict["new_sound_rec_signals_gradio"][sample_index])
        return {text2sound_latent_representation_image:
                    text2sound_dict["latent_representation_gradio_images"][sample_index],
                text2sound_quantized_latent_representation_image:
                    text2sound_dict["quantized_latent_representation_gradio_images"][sample_index],
                text2sound_sampled_spectrogram_image: resize_image_to_aspect_ratio(
                    text2sound_dict["new_sound_spectrogram_gradio_images"][sample_index], 1.55, 1),
                text2sound_sampled_phase_image: resize_image_to_aspect_ratio(
                    text2sound_dict["new_sound_phase_gradio_images"][sample_index], 1.55, 1),
                text2sound_sampled_audio: text2sound_dict["new_sound_rec_signals_gradio"][sample_index]}

    def save_virtual_instrument(sample_index, virtual_instrument_name, text2sound_dict, virtual_instruments_dict):
        virtual_instruments_dict = add_instrument(text2sound_dict, virtual_instruments_dict, virtual_instrument_name,
                                                  sample_index)

        return {virtual_instruments_state: virtual_instruments_dict,
                text2sound_instrument_name_textbox: gr.Textbox(label="Instrument name", lines=1,
                                                               placeholder=f"Saved as {virtual_instrument_name}!")}

    with gr.Tab("Text2sound"):
        gr.Markdown("Use neural networks to select random sounds using your favorite instrument!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")

            with gr.Column(scale=1):
                text2sound_sampling_button = gr.Button(variant="primary",
                                                       value="Generate a batch of samples and show "
                                                             "the first one",
                                                       scale=1)
                text2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0, visible=False,
                                                           label="Sample index",
                                                           info="Swipe to view other samples")
        with gr.Row(variant="panel"):
            with gr.Column(variant="panel", scale=1):
                text2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
                text2sound_sampler_radio = gradioWebUI.get_sampler_radio()
                text2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
                text2sound_duration_slider = gradioWebUI.get_duration_slider()
                text2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
                text2sound_seed_textbox = gradioWebUI.get_seed_textbox()

            with gr.Column(variant="panel", scale=1):
                with gr.Row(variant="panel", ):
                    text2sound_sampled_spectrogram_image = gr.Image(label="Sampled spectrogram", type="numpy", )
                    text2sound_sampled_phase_image = gr.Image(label="Sampled phase", type="numpy")
                text2sound_sampled_audio = gr.Audio(type="numpy", label="Play",
                                                    scale=1)

                with gr.Row(variant="panel", ):
                    text2sound_instrument_name_textbox = gr.Textbox(label="Instrument name", lines=2,
                                                                    placeholder="Name of your instrument",
                                                                    scale=1)
                    text2sound_save_instrument_button = gr.Button(variant="primary",
                                                                  value="Save instrument",
                                                                  scale=1)

        with gr.Row(variant="panel"):
            text2sound_latent_representation_image = gr.Image(label="Sampled latent representation", type="numpy",
                                                              height=200, width=100, visible=False)
            text2sound_quantized_latent_representation_image = gr.Image(label="Quantized latent representation",
                                                                        type="numpy", height=200, width=100,
                                                                        visible=False)

        text2sound_sampling_button.click(diffusion_random_sample,
                                         inputs=[text2sound_prompts_textbox,
                                                 text2sound_negative_prompts_textbox,
                                                 text2sound_batchsize_slider,
                                                 text2sound_duration_slider,
                                                 text2sound_guidance_scale_slider, text2sound_sampler_radio,
                                                 text2sound_sample_steps_slider,
                                                 text2sound_seed_textbox,
                                                 text2sound_state],
                                         outputs=[text2sound_latent_representation_image,
                                                  text2sound_quantized_latent_representation_image,
                                                  text2sound_sampled_spectrogram_image,
                                                  text2sound_sampled_phase_image,
                                                  text2sound_sampled_audio,
                                                  text2sound_seed_textbox,
                                                  text2sound_state,
                                                  text2sound_sample_index_slider])

        text2sound_save_instrument_button.click(save_virtual_instrument,
                                                inputs=[text2sound_sample_index_slider,
                                                        text2sound_instrument_name_textbox,
                                                        text2sound_state,
                                                        virtual_instruments_state],
                                                outputs=[virtual_instruments_state,
                                                         text2sound_instrument_name_textbox])

        text2sound_sample_index_slider.change(show_random_sample,
                                              inputs=[text2sound_sample_index_slider, text2sound_state],
                                              outputs=[text2sound_latent_representation_image,
                                                       text2sound_quantized_latent_representation_image,
                                                       text2sound_sampled_spectrogram_image,
                                                       text2sound_sampled_phase_image,
                                                       text2sound_sampled_audio])
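For orientation, a rough sketch of how a module factory like get_text2sound_module is mounted inside a gr.Blocks app; the state initial values below are assumptions consistent with utils.add_instrument, and gradioWebUI is assumed to be an already-constructed GradioWebUI instance as in app.py:

import gradio as gr
from webUI.natural_language_guided_4.text2sound import get_text2sound_module

with gr.Blocks() as demo:
    text2sound_state = gr.State(value={})  # per-session sampling results
    virtual_instruments_state = gr.State(value={"virtual_instruments": {}})  # saved instruments
    get_text2sound_module(gradioWebUI, text2sound_state, virtual_instruments_state)  # gradioWebUI: assumed pre-built

demo.launch()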
webUI/natural_language_guided_4/track_maker.py
ADDED
@@ -0,0 +1,248 @@
import librosa
import numpy as np
import torch

from model.DiffSynthSampler import DiffSynthSampler
from webUI.natural_language_guided_4.utils import encodeBatch2GradioOutput_STFT
import mido
import torchaudio.transforms as transforms
from tqdm import tqdm


# def pitch_shift_audio(waveform, sample_rate, n_steps, device='cpu', n_fft=1024, hop_length=None):
#     # If the input is a numpy array, convert it to a torch.Tensor
#     if isinstance(waveform, np.ndarray):
#         waveform = torch.from_numpy(waveform)
#
#     # Default hop_length relative to n_fft (a reasonable default) to reduce the memory cost of the STFT
#     if hop_length is None:
#         hop_length = n_fft // 4
#
#     # Move the waveform to the requested device
#     waveform = waveform.to(device, dtype=torch.float32)
#
#     # Create the pitch_shift transform and move it to the requested device
#     pitch_shift = transforms.PitchShift(
#         sample_rate=sample_rate,
#         n_steps=n_steps,
#         n_fft=n_fft,
#         hop_length=hop_length
#     ).to(device)
#
#     # Apply the transform, move the result back to the CPU and convert it to a numpy array
#     shifted_waveform = pitch_shift(waveform).detach().cpu().numpy()
#
#     return shifted_waveform


def pitch_shift_librosa(waveform, sample_rate, total_steps, step_size=4, n_fft=4096, hop_length=None):
    # librosa expects a numpy array as input
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.numpy()

    # If hop_length is not provided, default to a quarter of n_fft
    if hop_length is None:
        hop_length = n_fft // 4

    # Shift the pitch incrementally, step_size semitones at a time
    current_waveform = waveform
    num_steps = int(np.ceil(total_steps / step_size))

    for i in range(num_steps):
        step = min(step_size, total_steps - i * step_size)  # make sure the final step does not overshoot total_steps
        current_waveform = librosa.effects.pitch_shift(
            current_waveform, sr=sample_rate, n_steps=step,
            n_fft=n_fft, hop_length=hop_length
        )

    return current_waveform


class NoteEvent:
    def __init__(self, note, velocity, start_time, duration):
        self.note = note
        self.velocity = velocity
        self.start_time = start_time  # In ticks
        self.duration = duration  # In ticks

    def __str__(self):
        return f"Note {self.note}, velocity {self.velocity}, start_time {self.start_time}, duration {self.duration}"


class Track:
    def __init__(self, track, ticks_per_beat, max_notes=100):
        self.tempo_events = self._parse_tempo_events(track)
        self.events = self._parse_note_events(track)
        self.ticks_per_beat = ticks_per_beat
        self.max_notes = int(max_notes)

    def _parse_tempo_events(self, track):
        tempo_events = []
        current_tempo = 500000  # Default MIDI tempo is 120 BPM, which is 500000 microseconds per beat
        for msg in track:
            if msg.type == 'set_tempo':
                tempo_events.append((msg.time, msg.tempo))
            elif not msg.is_meta:
                tempo_events.append((msg.time, current_tempo))
        return tempo_events

    def _parse_note_events(self, track):
        events = []
        start_time = 0
        for msg in track:
            if not msg.is_meta:
                start_time += msg.time
                if msg.type == 'note_on' and msg.velocity > 0:
                    note_on_time = start_time
                elif msg.type == 'note_on' and msg.velocity == 0:
                    duration = start_time - note_on_time
                    events.append(NoteEvent(msg.note, msg.velocity, note_on_time, duration))
        return events

    def synthesize_track(self, diffSynthSampler, sample_rate=16000):
        track_audio = np.zeros(int(self._get_total_time() * sample_rate), dtype=np.float32)
        current_tempo = 500000  # Start with the default MIDI tempo of 120 BPM
        duration_note_mapping = {}

        for event in tqdm(self.events[:self.max_notes]):
            current_tempo = self._get_tempo_at(event.start_time)
            seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
            start_time_sec = event.start_time * seconds_per_tick
            # Todo: set a minimum duration
            duration_sec = event.duration * seconds_per_tick
            duration_sec = max(duration_sec, 0.75)
            start_sample = int(start_time_sec * sample_rate)
            if not (str(duration_sec) in duration_note_mapping):
                note_sample = diffSynthSampler(event.velocity, duration_sec)
                duration_note_mapping[str(duration_sec)] = note_sample / np.max(np.abs(note_sample))

            # note_audio = pyrb.pitch_shift(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
            # note_audio = pitch_shift_audio(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
            note_audio = pitch_shift_librosa(duration_note_mapping[str(duration_sec)], sample_rate, event.note - 52)
            end_sample = start_sample + len(note_audio)
            track_audio[start_sample:end_sample] += note_audio

        return track_audio

    def _get_tempo_at(self, time_tick):
        current_tempo = 500000  # Start with the default MIDI tempo of 120 BPM
        elapsed_ticks = 0

        for tempo_change in self.tempo_events:
            if elapsed_ticks + tempo_change[0] > time_tick:
                return current_tempo
            elapsed_ticks += tempo_change[0]
            current_tempo = tempo_change[1]

        return current_tempo

    def _get_total_time(self):
        total_time = 0
        current_tempo = 500000  # Start with the default MIDI tempo of 120 BPM

        for event in self.events:
            current_tempo = self._get_tempo_at(event.start_time)
            seconds_per_tick = mido.tick2second(1, self.ticks_per_beat, current_tempo)
            total_time += event.duration * seconds_per_tick

        return total_time + 10


class DiffSynth:
    def __init__(self, instruments_configs, noise_prediction_model, VAE_quantizer, VAE_decoder, text_encoder, CLAP_tokenizer, device,
                 model_sample_rate=16000, timesteps=1000, channels=4, freq_resolution=512, time_resolution=256, VAE_scale=4, squared=False):

        self.noise_prediction_model = noise_prediction_model
        self.VAE_quantizer = VAE_quantizer
        self.VAE_decoder = VAE_decoder
        self.device = device
        self.model_sample_rate = model_sample_rate
        self.timesteps = timesteps
        self.channels = channels
        self.freq_resolution = freq_resolution
        self.time_resolution = time_resolution
        self.height = int(freq_resolution / VAE_scale)
        self.VAE_scale = VAE_scale
        self.squared = squared
        self.text_encoder = text_encoder
        self.CLAP_tokenizer = CLAP_tokenizer

        # instruments_configs is a dict: string -> (condition, negative_condition, guidance_scale, sample_steps, seed, initial_noise, sampler)
        self.instruments_configs = instruments_configs
        self.diffSynthSamplers = {}
        self._update_instruments()

    def _update_instruments(self):

        def diffSynthSamplerWrapper(instruments_config):

            def diffSynthSampler(velocity, duration_sec, sample_rate=16000):
                condition = self.text_encoder.get_text_features(**self.CLAP_tokenizer([""], padding=True, return_tensors="pt")).to(self.device)
                sample_steps = instruments_config['sample_steps']
                sampler = instruments_config['sampler']
                noising_strength = instruments_config['noising_strength']
                latent_representation = instruments_config['latent_representation']
                attack = instruments_config['attack']
                before_release = instruments_config['before_release']

                assert sample_rate == self.model_sample_rate, "sample_rate != model_sample_rate"

                width = int(self.time_resolution * ((duration_sec + 1) / 4) / self.VAE_scale)

                mySampler = DiffSynthSampler(self.timesteps, height=128, channels=4, noise_strategy="repeat", mute=True)
                mySampler.respace(list(np.linspace(0, self.timesteps - 1, sample_steps, dtype=np.int32)))

                # mask = 1, freeze
                latent_mask = torch.zeros((1, 1, self.height, width), dtype=torch.float32).to(self.device)
                latent_mask[:, :, :, :int(self.time_resolution * (attack / 4) / self.VAE_scale)] = 1.0
                latent_mask[:, :, :, -int(self.time_resolution * ((before_release + 1) / 4) / self.VAE_scale):] = 1.0

                latent_representations, _ = \
                    mySampler.inpaint_sample(model=self.noise_prediction_model, shape=(1, self.channels, self.height, width),
                                             noising_strength=noising_strength, condition=condition,
                                             guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                             sampler=sampler,
                                             use_dynamic_mask=True, end_noise_level_ratio=0.0,
                                             mask_flexivity=1.0)

                latent_representations = latent_representations[-1]

                quantized_latent_representations, _, (_, _, _) = self.VAE_quantizer(latent_representations)
                # Todo: remove hard-coding
                flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(self.VAE_decoder,
                                                                                                            quantized_latent_representations,
                                                                                                            resolution=(
                                                                                                                512,
                                                                                                                width * self.VAE_scale),
                                                                                                            original_STFT_batch=None,
                                                                                                            )

                return rec_signals[0]

            return diffSynthSampler

        for key in self.instruments_configs.keys():
            self.diffSynthSamplers[key] = diffSynthSamplerWrapper(self.instruments_configs[key])

    def get_music(self, mid, instrument_names, sample_rate=16000, max_notes=100):
        tracks = [Track(t, mid.ticks_per_beat, max_notes) for t in mid.tracks]
        assert len(tracks) <= len(instrument_names), f"len(tracks) = {len(tracks)} > {len(instrument_names)} = len(instrument_names)"

        track_audios = [track.synthesize_track(self.diffSynthSamplers[instrument_names[i]], sample_rate=sample_rate) for i, track in enumerate(tracks)]

        # Pad every track to the length of the longest one so they can be mixed together
        max_length = max(len(audio) for audio in track_audios)
        full_audio = np.zeros(max_length, dtype=np.float32)  # initialise the full mix with zeros
        for audio in track_audios:
            # A track may be shorter than the mix, so pad it with zeros
            padded_audio = np.pad(audio, (0, max_length - len(audio)), 'constant')
            full_audio += padded_audio  # mix the track in

        return full_audio
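As a usage sketch (not part of this commit), the Track parser can be exercised with a plain decaying sine standing in for a DiffSynth instrument. This assumes the chosen track encodes note-offs as note_on messages with velocity 0, which is what Track._parse_note_events expects, and that soundfile is available for writing the result; since the parser stores the note-off velocity, the velocity argument is ignored here:

import numpy as np
import mido
import soundfile as sf

from webUI.natural_language_guided_4.track_maker import Track

def toy_sampler(velocity, duration_sec, sample_rate=16000):
    # Stand-in for a diffusion-based instrument: a decaying sine at the reference
    # pitch (MIDI note 52), which synthesize_track then shifts by (note - 52) semitones.
    t = np.linspace(0, duration_sec, int(duration_sec * sample_rate), endpoint=False)
    return np.exp(-3.0 * t) * np.sin(2 * np.pi * 164.81 * t)

mid = mido.MidiFile("webUI/presets/midis/Ode_to_Joy_Easy_variation.mid")
track = Track(mid.tracks[0], mid.ticks_per_beat, max_notes=20)  # pick a track that actually contains note events
audio = track.synthesize_track(toy_sampler, sample_rate=16000)
peak = np.max(np.abs(audio))
sf.write("toy_render.wav", audio / peak if peak > 0 else audio, 16000)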
webUI/natural_language_guided_4/utils.py
ADDED
@@ -0,0 +1,228 @@
import librosa
import numpy as np
import torch
from PIL import Image
from tools import np_power_to_db, decode_stft, depad_STFT


def spectrogram_to_Gradio_image(spc):
    ### input: spc [np.ndarray]
    frequency_resolution, time_resolution = spc.shape[-2], spc.shape[-1]
    spc = np.reshape(spc, (frequency_resolution, time_resolution))

    # Todo:
    magnitude_spectrum = np.abs(spc)
    log_spectrum = np_power_to_db(magnitude_spectrum)
    flipped_log_spectrum = np.flipud(log_spectrum)

    colorful_spc = np.ones((frequency_resolution, time_resolution, 3)) * -80.0
    colorful_spc[:, :, 0] = flipped_log_spectrum
    colorful_spc[:, :, 1] = flipped_log_spectrum
    colorful_spc[:, :, 2] = np.ones((frequency_resolution, time_resolution)) * -60.0
    # Rescale to 0-255 and convert to uint8
    rescaled = (colorful_spc + 80.0) / 80.0
    rescaled = (255.0 * rescaled).astype(np.uint8)
    return rescaled


def phase_to_Gradio_image(phase):
    ### input: phase [np.ndarray]
    frequency_resolution, time_resolution = phase.shape[-2], phase.shape[-1]
    phase = np.reshape(phase, (frequency_resolution, time_resolution))

    # Todo:
    flipped_phase = np.flipud(phase)
    flipped_phase = (flipped_phase + 1.0) / 2.0

    colorful_spc = np.zeros((frequency_resolution, time_resolution, 3))
    colorful_spc[:, :, 0] = flipped_phase
    colorful_spc[:, :, 1] = flipped_phase
    colorful_spc[:, :, 2] = 0.2
    # Rescale to 0-255 and convert to uint8
    rescaled = (255.0 * colorful_spc).astype(np.uint8)
    return rescaled


def latent_representation_to_Gradio_image(latent_representation):
    # input: latent_representation [torch.tensor]
    if not isinstance(latent_representation, np.ndarray):
        latent_representation = latent_representation.to("cpu").detach().numpy()
    image = latent_representation

    def normalize_image(img):
        min_val = img.min()
        max_val = img.max()
        normalized_img = ((img - min_val) / (max_val - min_val) * 255)
        return normalized_img

    image[0, :, :] = normalize_image(image[0, :, :])
    image[1, :, :] = normalize_image(image[1, :, :])
    image[2, :, :] = normalize_image(image[2, :, :])
    image[3, :, :] = normalize_image(image[3, :, :])
    image_transposed = np.transpose(image, (1, 2, 0))
    enlarged_image = np.repeat(image_transposed, 8, axis=0)
    enlarged_image = np.repeat(enlarged_image, 8, axis=1)
    return np.flipud(enlarged_image).astype(np.uint8)


def InputBatch2Encode_STFT(encoder, STFT_batch, resolution=(512, 256), quantizer=None, squared=True):
    """Transform a batch of STFT spectrograms into display images, signals and encodings."""
    # Todo: remove resolution hard-coding
    frequency_resolution, time_resolution = resolution

    device = next(encoder.parameters()).device
    if not (quantizer is None):
        latent_representation_batch = encoder(STFT_batch.to(device))
        quantized_latent_representation_batch, loss, (_, _, _) = quantizer(latent_representation_batch)
    else:
        mu, logvar, latent_representation_batch = encoder(STFT_batch.to(device))
        quantized_latent_representation_batch = None

    STFT_batch = STFT_batch.to("cpu").detach().numpy()

    origin_flipped_log_spectrums, origin_flipped_phases, origin_signals = [], [], []
    for STFT in STFT_batch:
        padded_D_rec = decode_stft(STFT)
        D_rec = depad_STFT(padded_D_rec)
        spc = np.abs(D_rec)
        phase = np.angle(D_rec)

        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)

        # get_audio
        rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)

        origin_flipped_log_spectrums.append(flipped_log_spectrum)
        origin_flipped_phases.append(flipped_phase)
        origin_signals.append(rec_signal)

    return origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, \
        latent_representation_batch, quantized_latent_representation_batch


def encodeBatch2GradioOutput_STFT(decoder, latent_vector_batch, resolution=(512, 256), original_STFT_batch=None):
    """Decode a batch of latent vectors into spectrogram images, phase images and audio signals."""
    # Todo: remove resolution hard-coding
    frequency_resolution, time_resolution = resolution

    if isinstance(latent_vector_batch, np.ndarray):
        latent_vector_batch = torch.from_numpy(latent_vector_batch).to(next(decoder.parameters()).device)

    reconstruction_batch = decoder(latent_vector_batch).to("cpu").detach().numpy()

    flipped_log_spectrums, flipped_phases, rec_signals = [], [], []
    flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp = [], [], []

    for index, STFT in enumerate(reconstruction_batch):
        padded_D_rec = decode_stft(STFT)
        D_rec = depad_STFT(padded_D_rec)
        spc = np.abs(D_rec)
        phase = np.angle(D_rec)

        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)

        # get_audio
        rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)

        flipped_log_spectrums.append(flipped_log_spectrum)
        flipped_phases.append(flipped_phase)
        rec_signals.append(rec_signal)

        ##########################################

        if original_STFT_batch is not None:
            STFT[0, :, :] = original_STFT_batch[index, 0, :, :]

            padded_D_rec = decode_stft(STFT)
            D_rec = depad_STFT(padded_D_rec)
            spc = np.abs(D_rec)
            phase = np.angle(D_rec)

            flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
            flipped_phase = phase_to_Gradio_image(phase)

            # get_audio
            rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)

            flipped_log_spectrums_with_original_amp.append(flipped_log_spectrum)
            flipped_phases_with_original_amp.append(flipped_phase)
            rec_signals_with_original_amp.append(rec_signal)

    return flipped_log_spectrums, flipped_phases, rec_signals, \
        flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp


def add_instrument(source_dict, virtual_instruments_dict, virtual_instrument_name, sample_index):
    virtual_instruments = virtual_instruments_dict["virtual_instruments"]
    virtual_instrument = {
        "latent_representation": source_dict["latent_representations"][sample_index],
        "quantized_latent_representation": source_dict["quantized_latent_representations"][sample_index],
        "sampler": source_dict["sampler"],
        "signal": source_dict["new_sound_rec_signals_gradio"][sample_index],
        "spectrogram_gradio_image": source_dict["new_sound_spectrogram_gradio_images"][sample_index],
        "phase_gradio_image": source_dict["new_sound_phase_gradio_images"][sample_index]}
    virtual_instruments[virtual_instrument_name] = virtual_instrument
    virtual_instruments_dict["virtual_instruments"] = virtual_instruments
    return virtual_instruments_dict


def resize_image_to_aspect_ratio(image_data, aspect_ratio_width, aspect_ratio_height):
    """
    Stretch an image to the given aspect ratio, keeping both input and output as NumPy arrays.

    Args:
        image_data (numpy array): input image data (height, width, 3)
        aspect_ratio_width (int): target width ratio
        aspect_ratio_height (int): target height ratio

    Returns:
        numpy array: the resized image data
    """
    # Current width and height of the image
    original_height, original_width, channels = image_data.shape

    # Current aspect ratio
    current_aspect_ratio = original_width / original_height

    # Target aspect ratio
    target_aspect_ratio = aspect_ratio_width / aspect_ratio_height

    # Decide whether to stretch the width or the height
    if current_aspect_ratio > target_aspect_ratio:
        # The image is wider than the target ratio, so stretch the height
        new_width = original_width
        new_height = int(new_width / target_aspect_ratio)
    else:
        # The image is narrower than (or equal to) the target ratio, so stretch the width
        new_height = original_height
        new_width = int(new_height * target_aspect_ratio)

    # Convert the numpy array to a PIL image
    image = Image.fromarray(image_data.astype('uint8'))

    # Resize with PIL, using LANCZOS in place of the deprecated ANTIALIAS
    resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

    # Convert the PIL image back to a numpy array
    resized_image_data = np.array(resized_image)

    return resized_image_data


def average_np_arrays(arr_list):
    if not arr_list:
        raise ValueError("Input list cannot be empty")

    stacked_arrays = np.stack(arr_list, axis=0)

    avg_array = np.mean(stacked_arrays, axis=0)

    return avg_array
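A small sketch of how the image helpers above fit together, assuming the repository's tools module is importable (the random array is a stand-in for a decoded STFT magnitude):

import numpy as np
from webUI.natural_language_guided_4.utils import spectrogram_to_Gradio_image, resize_image_to_aspect_ratio

fake_spc = np.abs(np.random.randn(512, 256)).astype(np.float32)  # (freq, time) magnitude spectrogram
gradio_image = spectrogram_to_Gradio_image(fake_spc)  # (512, 256, 3) uint8, low frequencies at the bottom
widescreen = resize_image_to_aspect_ratio(gradio_image, 1.55, 1)  # stretched for display, as in text2sound.py
print(gradio_image.shape, widescreen.shape)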
webUI/presets/instruments/ax.wav
ADDED
Binary file (131 kB).
webUI/presets/instruments/electronic_sound.wav
ADDED
Binary file (131 kB).
webUI/presets/instruments/keyboard.wav
ADDED
Binary file (128 kB).
webUI/presets/instruments/organ.wav
ADDED
Binary file (131 kB).
webUI/presets/instruments/string.wav
ADDED
Binary file (131 kB).
webUI/presets/instruments/synth_lead.wav
ADDED
Binary file (131 kB).
webUI/presets/midis/Air_on_the_G_String.mid
ADDED
Binary file (6 kB).
webUI/presets/midis/Arhbo.mid
ADDED
Binary file (14.7 kB).
webUI/presets/midis/Canon_in_D.mid
ADDED
Binary file (10.9 kB).
webUI/presets/midis/Ode_to_Joy_Easy_variation.mid
ADDED
Binary file (920 Bytes).
webUI/presets/midis/Rrharil.mid
ADDED
Binary file (16.2 kB).