WeixuanYuan committed • Commit b88cc47 • Parent(s): 18a55e0
Upload 31 files

- MyTest.py +3 -0
- NN.json +0 -0
- app.py +184 -0
- configurations/conf.json +83 -0
- configurations/read_configuration.py +152 -0
- data_generation/data_generation.py +380 -0
- data_generation/decoding.py +64 -0
- data_generation/encoding.py +92 -0
- example.ipynb +0 -0
- external sources.txt +3 -0
- generate_synthetic_data_online.py +431 -0
- load_data.py +150 -0
- melody_synth/complex_torch_synth.py +221 -0
- melody_synth/melody_generator.py +121 -0
- melody_synth/non_random_LFOs.py +121 -0
- melody_synth/random_duration.py +86 -0
- melody_synth/random_midi.py +86 -0
- melody_synth/random_pitch.py +87 -0
- melody_synth/random_rhythm.py +143 -0
- model/VAE.py +230 -0
- model/VAE_torchV.py +171 -0
- model/perceptual_label_predictor.py +68 -0
- models/decoder_5_13.pt +3 -0
- models/encoder_5_13.pt +3 -0
- models/new_trained_models/perceptual_label_predictor.h5 +3 -0
- models/perceptual_label_predictor.h5 +3 -0
- new_sound_generation.py +203 -0
- requirements.txt +5 -0
- test_audio.wav +0 -0
- tools.py +84 -0
- webUI/initial_example_encodes.json +210 -0
MyTest.py
ADDED
@@ -0,0 +1,3 @@
+from load_data import load_data
+
+data_cache = load_data(500)
NN.json
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,184 @@
+import gradio as gr
+import os
+import json
+import numpy as np
+import torch
+import librosa
+from tools import VAE_out_put_to_spc, np_power_to_db
+from model.VAE_torchV import Encoder, Decoder
+
+SPECTROGRAM_RESOLUTION = (512, 256, 3)
+encoder = Encoder((1, 512, 256), 24, N2=0, channel_sizes=[64, 64, 64, 96, 96, 128, 160, 216]).to("cuda")
+decoder = Decoder(24, N2=0, N3=8, channel_sizes=[64, 64, 64, 96, 96, 128, 160, 216]).to("cuda")
+model_name = "test"
+encoder.load_state_dict(torch.load(f"models/{model_name}_encoder_CA.pt"))
+decoder.load_state_dict(torch.load(f"models/{model_name}_decoder_CA.pt"))
+
+INIT_ENCODE_CACHE = {"init": np.random.random((24,))}
+with open('webUI/initial_example_encodes.json', 'r') as f:
+    list_dict = json.load(f)
+for k in list_dict.keys():
+    INIT_ENCODE_CACHE[k] = np.array(list_dict[k])
+
+#################################
+def prepare_image(image):
+    # Rescale to 0-255 and convert to uint8
+    rescaled = (image + 80.0) / 80.0
+    rescaled = (255.0 * rescaled).astype(np.uint8)
+    return rescaled
+
+
+def encodeBatch2GradioOutput(latent_vector_batch, resolution=(512, 256)):
+    """Decode a batch of latent vectors into spectrogram images and Griffin-Lim audio."""
+    reconstruction_batch = decoder(latent_vector_batch).to("cpu").detach().numpy()
+    flipped_log_spectrums, rec_signals = [], []
+    for reconstruction in reconstruction_batch:
+        spc = VAE_out_put_to_spc(reconstruction)
+        spc = np.reshape(spc, resolution)
+        magnitude_spectrum = np.abs(spc)
+        log_spectrum = np_power_to_db(magnitude_spectrum)
+        flipped_log_spectrum = np.flipud(log_spectrum)
+
+        colorful_spc = np.ones((512, 256, 3)) * -80.0
+        colorful_spc[:, :, 0] = flipped_log_spectrum
+        colorful_spc[:, :, 1] = flipped_log_spectrum
+        colorful_spc[:, :, 2] = np.ones((512, 256)) * -60.0
+        flipped_log_spectrum = prepare_image(colorful_spc)
+
+        # get_audio
+        abs_spec = np.zeros((513, 256))
+        abs_spec[:512, :] = abs_spec[:512, :] + np.sqrt(np.reshape(spc, (512, 256)))
+        rec_signal = librosa.griffinlim(abs_spec, n_iter=32, hop_length=256, win_length=1024)
+        flipped_log_spectrums.append(flipped_log_spectrum)
+        rec_signals.append(rec_signal)
+
+    return flipped_log_spectrums, 16000, rec_signals
+
+
+def get_example_module(encodeCache):
+    def show_example(selected_example, encodeCache):
+        example_encode = torch.Tensor(np.reshape(encodeCache[selected_example], (-1, 24))).to("cuda")
+        flipped_log_spectrums, sampleRate, rec_signals = encodeBatch2GradioOutput(example_encode)
+        flipped_log_spectrum, rec_signal = flipped_log_spectrums[0], rec_signals[0]
+
+        return flipped_log_spectrum, str(encodeCache), (sampleRate, rec_signal), encodeCache
+
+    with gr.Tab("Examples"):
+        gr.Markdown("Predefined examples.")
+        with gr.Row():
+            with gr.Column():
+                selected_example = gr.Dropdown(
+                    list(INIT_ENCODE_CACHE.keys()), label="Examples", info="Choose one example!"
+                )
+                example_button = gr.Button(value="Show example")
+            with gr.Column():
+                example_image_output = gr.Image(label="Reconstruction", type="numpy")
+                example_image_output.style(height=250, width=600)
+                example_audio_output = gr.Audio(type="numpy", label="Play reconstruction output!")
+                example_text_output = gr.Textbox()
+        example_button.click(show_example, inputs=[selected_example, encodeCache],
+                             outputs=[example_image_output, example_text_output, example_audio_output, encodeCache])
+
+
+def get_reconstruction_module():
+
+    def do_nothing(image_input):
+        return np.random.random(SPECTROGRAM_RESOLUTION)
+
+    with gr.Tab("Reconstruction"):
+        gr.Markdown("Test reconstruction.")
+        with gr.Row():
+            with gr.Column():
+                test_reconstruction_input = gr.Number(label="Batch_index")
+                test_reconstruction_button = gr.Button(value="Generate")
+            with gr.Column():
+                test_reconstruction_output = gr.Image(label="Reconstruction", type="numpy")
+                test_reconstruction_output.style(height=250, width=600)
+        test_reconstruction_button.click(do_nothing, inputs=test_reconstruction_input, outputs=test_reconstruction_output)
+
+
+def get_interpolation_module(encodeCache):
+
+    def interpolate(first_interpulation_input, second_interpulation_input, interpulation_input_ratio, encodeCache):
+        # Todo: use batch
+        first_interpulation_input_encode = torch.Tensor(np.reshape(encodeCache[first_interpulation_input], (-1, 24)))
+        second_interpulation_input_encode = torch.Tensor(np.reshape(encodeCache[second_interpulation_input], (-1, 24)))
+        ratio = torch.Tensor([interpulation_input_ratio])
+        interpulation_encode = first_interpulation_input_encode * ratio + second_interpulation_input_encode * (1 - ratio)
+
+        interpulation_input_encode = torch.stack((first_interpulation_input_encode, second_interpulation_input_encode, interpulation_encode), dim=0).to("cuda")
+        flipped_log_spectrums, sampleRate, rec_signals = encodeBatch2GradioOutput(interpulation_input_encode)
+        first_flipped_log_spectrum, first_rec_signal = flipped_log_spectrums[0], rec_signals[0]
+        second_flipped_log_spectrum, second_rec_signal = flipped_log_spectrums[1], rec_signals[1]
+        interpolation_flipped_log_spectrum, interpolation_rec_signal = flipped_log_spectrums[2], rec_signals[2]
+        return first_flipped_log_spectrum, (sampleRate, first_rec_signal), second_flipped_log_spectrum, (sampleRate, second_rec_signal), interpolation_flipped_log_spectrum, (sampleRate, interpolation_rec_signal), encodeCache
+
+    def refresh_interpolation_input(encodeCache):
+        return gr.Dropdown.update(choices=list(encodeCache.keys())), gr.Dropdown.update(choices=list(encodeCache.keys())), str(list(encodeCache.keys())), encodeCache
+
+    with gr.Tab("Interpolation"):
+        gr.Markdown("Test interpolation.")
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    first_interpulation_input = gr.Dropdown(list(INIT_ENCODE_CACHE.keys()), label="First input")
+                    second_interpulation_input = gr.Dropdown(list(INIT_ENCODE_CACHE.keys()), label="Second input")
+                with gr.Row():
+                    first_input_audio = gr.Audio(type="numpy", label="first_input_audio")
+                    first_input_audio.style(length=125)
+                    second_input_audio = gr.Audio(type="numpy", label="second_input_audio")
+                    second_input_audio.style(length=125)
+
+                interpulation_input_ratio = gr.Slider(minimum=-0.20, maximum=1.20, value=0.5, step=0.01, label="Ratio of the first input.")
+                interpulation_refresh_button = gr.Button(value="Refresh")
+                interpulation_button = gr.Button(value="Interpolate")
+            with gr.Column():
+                with gr.Row():
+                    first_input_spectrogram = gr.Image(label="First Input", type="numpy")
+                    first_input_spectrogram.style(height=250, width=125)
+                    interpolation_spectrogram = gr.Image(label="Interpolation", type="numpy")
+                    interpolation_spectrogram.style(height=250, width=125)
+                    second_input_spectrogram = gr.Image(label="Second Input", type="numpy")
+                    second_input_spectrogram.style(height=250, width=125)
+                interpolation_audio = gr.Audio(type="numpy", label="Interpolation")
+                interpolation_audio.style(length=125)
+
+        interpolation_text_output = gr.Textbox()
+        interpulation_refresh_button.click(refresh_interpolation_input, inputs=[encodeCache],
+                                           outputs=[first_interpulation_input, second_interpulation_input, interpolation_text_output, encodeCache])
+        interpulation_button.click(interpolate, inputs=[first_interpulation_input, second_interpulation_input, interpulation_input_ratio, encodeCache],
+                                   outputs=[first_input_spectrogram, first_input_audio, second_input_spectrogram, second_input_audio, interpolation_spectrogram, interpolation_audio, encodeCache])
+
+
+def get_random_sampling_module(encodeCache):
+
+    def random_sample(mu, sigma, encodeCache):
+        random_encode = torch.Tensor([mu]) + torch.Tensor([sigma]) * torch.randn(1, 24)
+        encodeCache["mytest"] = random_encode.detach().numpy()
+        random_encode = random_encode.to("cuda")
+        flipped_log_spectrums, sampleRate, rec_signals = encodeBatch2GradioOutput(random_encode)
+        random_log_spectrum, random_rec_signal = flipped_log_spectrums[0], rec_signals[0]
+        return random_log_spectrum, (sampleRate, random_rec_signal), encodeCache
+
+    with gr.Tab("Random sampling"):
+        gr.Markdown("Test random sampling.")
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    mu = gr.Number(label="mu")
+                    sigma = gr.Number(label="sigma")
+                random_sampling_button = gr.Button(value="Sample")
+            with gr.Column():
+                random_sampling_spectrogram = gr.Image(label="Random sampling", type="numpy")
+                random_sampling_spectrogram.style(height=250, width=600)
+                random_sampling_audio = gr.Audio(type="numpy", label="Random sampling")
+                random_sampling_audio.style(length=125)
+        random_sampling_button.click(random_sample, inputs=[mu, sigma, encodeCache], outputs=[random_sampling_spectrogram, random_sampling_audio, encodeCache])
+
+
+with gr.Blocks() as demo:
+    initial_examples = gr.State(value=INIT_ENCODE_CACHE)
+    # initial_interpolation_examples = gr.State(value={"init": np.random.random(SPECTROGRAM_RESOLUTION)})
+    get_example_module(initial_examples)
+    # get_reconstruction_module()
+    get_random_sampling_module(initial_examples)
+    get_interpolation_module(initial_examples)
+
+# demo.launch(share=True)
+demo.launch(share=True, debug=True)
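A quick way to exercise the decode path used by every tab above, outside the Gradio UI, is to feed encodeBatch2GradioOutput a random latent batch. A minimal sketch, assuming the checkpoints above loaded and a CUDA device is available (the shapes in the comments follow from the code, not from the commit):

import torch

latent = torch.randn(2, 24).to("cuda")  # two random 24-dim latent vectors
images, sample_rate, signals = encodeBatch2GradioOutput(latent)
print(images[0].shape)   # (512, 256, 3) uint8 spectrogram image
print(sample_rate)       # 16000
print(signals[0].shape)  # roughly 255 * 256 samples from Griffin-Lim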
configurations/conf.json
ADDED
@@ -0,0 +1,83 @@
+{
+  "midpoints":
+  {
+    "osc1_amp": [0.000, 0.01, 0.05, 0.15, 0.25, 0.45, 0.65, 0.75, 0.85, 0.89, 0.9],
+    "osc_amp2": [0.000, 0.01, 0.05, 0.15, 0.25, 0.45, 0.65, 0.75, 0.85, 0.89, 0.9],
+    "osc2_amp": [0.000, 0.01, 0.05, 0.15, 0.25, 0.45, 0.65, 0.75, 0.85, 0.89, 0.9],
+    "attack": [0.001, 0.03, 0.1, 0.25, 0.40, 0.7],
+    "decay": [0.001, 0.2, 0.60, 1.2],
+    "sustain": [0.01, 0.2, 0.5, 1.0],
+    "release": [0.001, 0.15, 0.35, 0.8],
+    "cutoff_freq": [2200, 2400, 2600, 2800, 3000,
+                    3200, 3400, 3600, 3800, 4000,
+                    4200, 4400, 4600, 4800, 5000, 5200, 5400, 5600, 5800, 6000, 6200, 6400, 6600, 6800, 7000, 7200, 7400, 7600, 7800],
+    "osc_types": [0, 1, 2, 3, 4, 5],
+    "amp_mod_depth": [0, 0, 0, 0.1, 0.3, 0.5, 1.0],
+    "amp_mod_freq": [0, 0, 1, 2, 4, 8],
+    "mod_waveforms": [0, 1, 2, 3],
+    "pitch_mod_depth": [0, 1],
+    "pitch_mod_freq": [1, 2, 4, 8]
+  },
+
+  "subspace_range":
+  {
+    "osc_amp2": 2,
+    "osc1_amp": 2,
+    "osc2_amp": 2,
+    "attack": 1,
+    "decay": 1,
+    "sustain": 2,
+    "release": 2,
+    "cutoff_freq": 0,
+    "osc_types": 0
+  },
+
+  "is_discrete":
+  {
+    "osc_amp2": false,
+    "osc1_amp": false,
+    "osc2_amp": false,
+    "attack": false,
+    "decay": false,
+    "sustain": false,
+    "release": false,
+    "cutoff_freq": false,
+    "duration": false,
+    "osc_types": true,
+    "mod_waveforms": true,
+    "amp_mod_depth": true,
+    "amp_mod_freq": true,
+    "pitch_mod_depth": true,
+    "pitch_mod_freq": true
+  },
+
+  "sample_rate": 16384,
+  "n_sample_note": 65536,
+  "n_sample_music": 65536,
+
+  "STFT_hyperParameter":
+  {
+    "frame_length": 512,
+    "frame_step": 256
+  },
+
+  "midi_midpoints":
+  {
+    "duration": [0.1, 0.5, 1.0, 2.0],
+    "pitch": [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]
+  },
+
+  "midi_is_discrete":
+  {
+    "duration": false,
+    "pitch": true
+  },
+
+  "midi_max_n_notes": 8,
+
+  "resolution":
+  {
+    "time_resolution": 509,
+    "freq_resolution": 513
+  }
+}
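The helpers in configurations/read_configuration.py (the next file) are the intended way to consume this configuration. A short sketch, assuming it is run from the repository root so the relative path 'configurations/conf.json' resolves:

from configurations.read_configuration import parameter_range, is_discrete, get_conf_sample_rate

print(get_conf_sample_rate())     # 16384
print(parameter_range('attack'))  # [0.001, 0.03, 0.1, 0.25, 0.4, 0.7]
print(is_discrete('osc_types'))   # True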
configurations/read_configuration.py
ADDED
@@ -0,0 +1,152 @@
+import json
+
+bins_path = 'configurations/conf.json'
+
+
+def parameter_range(parameter_name):
+    """
+    :param parameter_name:
+    :return: List[Float]--midpoints of bins for the input synthesizer parameter
+    """
+    with open(bins_path) as f:
+        midpoints = json.load(f)["midpoints"]
+    return midpoints[parameter_name]
+
+
+def cluster_range():
+    """
+    :return: Dict[String:Int]--defines the range of cluster to search
+    """
+    with open(bins_path) as f:
+        cluster_r = json.load(f)["subspace_range"]
+    return cluster_r
+
+
+def midi_parameter_range(parameter_name):
+    """
+    :param parameter_name:
+    :return: List[Float]--midpoints of bins for the input midi parameter
+    """
+    with open(bins_path) as f:
+        r = json.load(f)["midi_midpoints"]
+    return r[parameter_name]
+
+
+def is_discrete(parameter_name):
+    """
+    :param parameter_name:
+    :return: Boolean--whether the input synthesizer parameter is discrete
+    """
+    with open(bins_path) as f:
+        is_dis = json.load(f)["is_discrete"]
+    return is_dis[parameter_name]
+
+
+def midi_is_discrete(parameter_name):
+    """
+    :param parameter_name:
+    :return: Boolean--whether the input midi parameter is discrete
+    """
+    with open(bins_path) as f:
+        is_dis = json.load(f)["midi_is_discrete"]
+    return is_dis[parameter_name]
+
+
+def get_label_size():
+    """
+    :return: Int--length of synthesizer parameter encoding
+    """
+    with open(bins_path) as f:
+        conf = json.load(f)
+    midpoints = conf["midpoints"]
+    n_labels = 0
+    for key in midpoints:
+        n_labels = n_labels + len(midpoints[key])
+
+    return n_labels
+
+
+def get_bins_length():
+    """
+    :return: Dict[String:Int]--number of bins for all synthesizer parameters
+    """
+    with open(bins_path) as f:
+        midpoints = json.load(f)["midpoints"]
+    bins_length = {}
+
+    for key in midpoints:
+        bins_length[key] = len(midpoints[key])
+
+    return bins_length
+
+
+def get_conf_stft_hyperparameter():
+    """
+    :return: Dict[String:Int]--STFT hyperparameters
+    """
+    with open(bins_path) as f:
+        STFT_hyperParameters = json.load(f)["STFT_hyperParameter"]
+
+    return STFT_hyperParameters
+
+
+def get_conf_sample_rate():
+    """
+    :return: Int--sample_rate
+    """
+    with open(bins_path) as f:
+        sample_rate = json.load(f)["sample_rate"]
+
+    return sample_rate
+
+
+def get_conf_n_sample_note():
+    """
+    :return: Int--sample count of a note example
+    """
+    with open(bins_path) as f:
+        n_sample_note = json.load(f)["n_sample_note"]
+
+    return n_sample_note
+
+
+def get_conf_n_sample():
+    """
+    :return: Int--sample count of a melody example
+    """
+    with open(bins_path) as f:
+        n_sample = json.load(f)["n_sample_music"]
+
+    return n_sample
+
+
+def get_conf_time_resolution():
+    """
+    :return: Int--spectrogram resolution on the time dimension
+    """
+    with open(bins_path) as f:
+        resolution = json.load(f)["resolution"]
+
+    return resolution["time_resolution"]
+
+
+def get_conf_pitch_resolution():
+    """
+    :return: Int--spectrogram resolution on the pitch dimension
+    """
+    with open(bins_path) as f:
+        resolution = json.load(f)["resolution"]
+
+    return resolution["freq_resolution"]
+
+
+def get_conf_max_n_notes():
+    """
+    :return: Int--maximum number of notes to be generated in a melody
+    """
+    with open(bins_path) as f:
+        max_n_notes = json.load(f)["midi_max_n_notes"]
+
+    return max_n_notes
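Every helper above re-opens and re-parses conf.json on each call, which is fine at this scale. If the lookups ever become a hot path, a cached variant could look like the following sketch (_load_conf and parameter_range_cached are hypothetical names, not part of the commit):

import json
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_conf(path='configurations/conf.json'):
    # Parse the configuration once; later calls reuse the cached dict.
    with open(path) as f:
        return json.load(f)


def parameter_range_cached(parameter_name):
    return _load_conf()["midpoints"][parameter_name]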
data_generation/data_generation.py
ADDED
@@ -0,0 +1,380 @@
+from typing import List
+
+import librosa
+
+from data_generation.encoding import ParameterDescription, Sample
+from melody_synth.random_midi import RandomMidi
+from melody_synth.melody_generator import MelodyGenerator
+from scipy.io.wavfile import write
+from pathlib import Path
+from tqdm import tqdm
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import tensorflow as tf
+import matplotlib
+from configurations.read_configuration import parameter_range, is_discrete, get_conf_stft_hyperparameter
+import shutil
+
+# from model.log_spectrogram import power_to_db
+from tools import power_to_db
+
+num_params = 16
+
+
+def plot_spectrogram(signal: np.ndarray,
+                     path: str,
+                     frame_length=512,
+                     frame_step=256):
+    """Computes the spectrogram of the given signal and saves it.
+
+    Parameters
+    ----------
+    signal: np.ndarray
+        The signal for which to compute the spectrogram.
+    path: str
+        Path to save the computed spectrogram.
+    frame_length:
+        Window size of the FFT.
+    frame_step:
+        Hop size of the FFT.
+    """
+
+    # Compute spectrum for each frame. Returns complex tensor.
+    # todo: duplicate code in log_spectrogram.py. Move this somewhere else perhaps.
+    spectrogram = tf.signal.stft(signal,
+                                 frame_length=frame_length,
+                                 frame_step=frame_step,
+                                 pad_end=False)  # Returns 63 frames instead of 64 otherwise
+
+    # Compute the magnitudes
+    magnitude_spectrum = tf.abs(spectrogram)
+    log_spectrum = power_to_db(magnitude_spectrum)
+    matplotlib.pyplot.imsave(path, np.transpose(log_spectrum), vmin=-100, vmax=0, origin='lower')
+
+
+def plot_mel_spectrogram(signal: np.ndarray,
+                         path: str,
+                         frame_length=512,
+                         frame_step=256):
+    spectrogram = librosa.feature.melspectrogram(signal, sr=16384, n_fft=2048, hop_length=frame_step, win_length=frame_length)
+    matplotlib.pyplot.imsave(path, spectrogram, vmin=-100, vmax=0, origin='lower')
+
+
+# List of ParameterDescription objects that specify the parameters for generation
+param_descriptions: List[ParameterDescription]
+param_descriptions = [
+
+    # Oscillator levels
+    ParameterDescription(name="osc1_amp",
+                         values=parameter_range('osc1_amp'),
+                         discrete=is_discrete('osc1_amp')),
+    ParameterDescription(name="osc2_amp",
+                         values=parameter_range('osc2_amp'),
+                         discrete=is_discrete('osc2_amp')),
+
+    # ADSR params
+    ParameterDescription(name="attack",
+                         values=parameter_range('attack'),
+                         discrete=is_discrete('attack')),
+    ParameterDescription(name="decay",
+                         values=parameter_range('decay'),
+                         discrete=is_discrete('decay')),
+    ParameterDescription(name="sustain",
+                         values=parameter_range('sustain'),
+                         discrete=is_discrete('sustain')),
+    ParameterDescription(name="release",
+                         values=parameter_range('release'),
+                         discrete=is_discrete('release')),
+
+    ParameterDescription(name="cutoff_freq",
+                         values=parameter_range('cutoff_freq'),
+                         discrete=is_discrete('cutoff_freq')),
+
+    # Oscillator types
+    # 0 for sin saw, 1 for sin square, 2 for saw square
+    # 3 for sin triangle, 4 for triangle saw, 5 for triangle square
+    ParameterDescription(name="osc_types",
+                         values=parameter_range('osc_types'),
+                         discrete=is_discrete('osc_types')),
+]
+
+
+def generate_dataset_for_cnn(n: int,
+                             path_name="./data/data_cnn_model",
+                             sample_rate=16384,
+                             n_samples_for_note=16384 * 4,
+                             n_samples_for_melody=16384 * 4, write_parameter=True, write_spectrogram=True):
+    """
+    Generate dataset of size n for the 'InverSynth' cnn model
+
+    :param n: Int
+    :param path_name: String--path to save the dataset
+    :param sample_rate: Int
+    :param n_samples_for_note: Int
+    :param n_samples_for_melody: Int
+    :param write_parameter: Boolean--whether to write parameter values in a .txt file
+    :param write_spectrogram: Boolean--whether to write a spectrogram with parameter values in the file name
+    :return:
+    """
+
+    if Path(path_name).exists():
+        shutil.rmtree(path_name)
+    Path(path_name).mkdir(parents=True, exist_ok=True)
+    print("Generating dataset...")
+
+    synth = MelodyGenerator(sample_rate,
+                            n_samples_for_note, n_samples_for_melody)
+    randomMidi = RandomMidi()
+
+    for i in tqdm(range(n)):
+        parameter_values = [param.generate() for param in param_descriptions]
+
+        # Dict of parameter values, what our synthesizer expects as input
+        parameter_values_raw = {param.name: param.value for param in parameter_values}
+
+        strategy = {"rhythm_strategy": "free_rhythm",
+                    "pitch_strategy": "free_pitch",
+                    "duration_strategy": "random_duration",
+                    }
+        midi_encode, midi = randomMidi(strategy)
+        signal = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+
+        # Path to store each sample with its label
+        path = path_name + f"/{i}"
+        Path(path).mkdir(parents=True, exist_ok=True)
+
+        if write_parameter:
+            suffix = 'spectrogram'
+            for parameter_value in parameter_values:
+                suffix += f'_{parameter_value.name}={"%.3f" % parameter_value.value}'
+            if write_spectrogram:
+                plot_spectrogram(signal, path=path + f"/{suffix}.png", frame_length=1024, frame_step=256)
+            else:
+                with open(path + f"/{suffix}.txt", "w") as f:
+                    f.write("test")
+
+        write(path + f"/{i}.wav", synth.sample_rate, signal)
+
+        sample = Sample(parameter_values)
+
+        # Dump label as json
+        with open(path + "/label.json", "w") as label_file:
+            label = sample.get_values()
+            label['midi'] = midi
+            # print(len(label["encoding"]))
+            json.dump(label, label_file, ensure_ascii=True)
+
+    print('Data generation done!')
+
+
+def generate_dataset_for_triplet(n: int,
+                                 path_name="./data/data_triplet_val_10_500",
+                                 sample_rate=16384,
+                                 n_samples_for_note=16384 * 4,
+                                 n_samples_for_melody=16384 * 4,
+                                 n_labels=30,
+                                 write_spectrogram=True):
+    """
+    Generate dataset of size n for the triplet model
+
+    :param n: Int
+    :param path_name: String--path to save the dataset
+    :param sample_rate: Int
+    :param n_samples_for_note: Int
+    :param n_samples_for_melody: Int
+    :param n_labels: Int--number of synthesizer parameter combinations contained in the dataset (a hyperparameter of the triplet model)
+    :param write_spectrogram: Boolean--whether to write the spectrogram
+    """
+
+    if Path(path_name).exists():
+        shutil.rmtree(path_name)
+    Path(path_name).mkdir(parents=True, exist_ok=True)
+    print("Generating dataset...")
+    synth = MelodyGenerator(sample_rate,
+                            n_samples_for_note, n_samples_for_melody)
+    randomMidi = RandomMidi()
+
+    parameter_values_examples = [[param.generate() for param in param_descriptions] for i in range(n_labels)]
+    parameter_values_raw_examples = [{param.name: param.value for param in parameter_values} for parameter_values in
+                                     parameter_values_examples]
+
+    np.random.seed()
+    for i in tqdm(range(n)):
+        label_index = np.random.randint(0, n_labels)
+        parameter_values = parameter_values_examples[label_index]
+        parameter_values_raw = parameter_values_raw_examples[label_index]
+
+        strategy = {"rhythm_strategy": "free_rhythm",
+                    "pitch_strategy": "free_pitch",
+                    "duration_strategy": "random_duration",
+                    }
+        midi_encode, midi = randomMidi(strategy)
+        signal = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+
+        # Path to store each sample with its label
+        path = path_name + f"/{i}"
+        Path(path).mkdir(parents=True, exist_ok=True)
+
+        write(path + f"/{i}.wav", synth.sample_rate, signal)
+        suffix = 'spectrogram'
+        for parameter_value in parameter_values:
+            suffix += f'_{parameter_value.name}={"%.3f" % parameter_value.value}'
+
+        if write_spectrogram:
+            hp = get_conf_stft_hyperparameter()
+            frame_l = hp['frame_length']
+            frame_s = hp['frame_step']
+            plot_spectrogram(signal, path=path + f"/{suffix}.png", frame_length=frame_l, frame_step=frame_s)
+        else:
+            with open(path + f"/{suffix}.txt", "w") as f:
+                f.write("test")
+
+        with open(path + "/label_index.json", "w") as label_index_file:
+            index_json = {'index': label_index}
+            json.dump(index_json, label_index_file, ensure_ascii=False)
+
+        # save midi as .txt file
+        with open(path + "/midi.txt", "w") as midi_file:
+            midi_file.write(str(midi))
+
+    print('Data generation done!')
+
+
+def manhattan_distance(SP1, SP2):
+    """
+    :param SP1: first input synthesizer parameter combination
+    :param SP2: second input synthesizer parameter combination
+    :return: Float--Manhattan distance between SP1 and SP2
+    """
+
+    md = []
+    for key in SP1:
+        parameter_name = key
+        value1 = SP1[parameter_name]
+        value2 = SP2[parameter_name]
+        bins = parameter_range(parameter_name)
+        bin_index1 = np.argmin(np.abs(np.array(bins) - value1))
+        bin_index2 = np.argmin(np.abs(np.array(bins) - value2))
+
+        if parameter_name == "osc_types":
+            if bin_index1 == bin_index2:
+                d = 0
+            else:
+                d = 1
+        else:
+            d = np.abs(bin_index1 - bin_index2) / (len(bins) - 1)
+        md.append(d)
+
+    return np.average(md)
+
+
+def generate_dataset_for_mixed_input_model(n: int,
+                                           path_name="./data/data_mixed_input",
+                                           sample_rate=16384,
+                                           n_samples_for_note=16384 * 4,
+                                           n_samples_for_melody=16384 * 4
+                                           ):
+    """
+    Generate dataset of size n for the mixed-input model
+
+    :param n: Int
+    :param path_name: String--path to save the dataset
+    :param sample_rate: Int
+    :param n_samples_for_note: Int
+    :param n_samples_for_melody: Int
+    :return:
+    """
+
+    if Path(path_name).exists():
+        shutil.rmtree(path_name)
+    Path(path_name).mkdir(parents=True, exist_ok=True)
+    print("Generating dataset...")
+    synth = MelodyGenerator(sample_rate,
+                            n_samples_for_note, n_samples_for_melody)
+    randomMidi = RandomMidi()
+
+    strategy = {"rhythm_strategy": "free_rhythm",
+                "pitch_strategy": "free_pitch",
+                "duration_strategy": "random_duration",
+                }
+    strategy0 = {"rhythm_strategy": "single_note_rhythm",
+                 "pitch_strategy": "fixed_pitch",
+                 "duration_strategy": "fixed_duration",
+                 }
+    strategy1 = {"rhythm_strategy": "single_note_rhythm",
+                 "pitch_strategy": "fixed_pitch1",
+                 "duration_strategy": "fixed_duration",
+                 }
+    strategy2 = {"rhythm_strategy": "single_note_rhythm",
+                 "pitch_strategy": "fixed_pitch2",
+                 "duration_strategy": "fixed_duration",
+                 }
+    strategy3 = {"rhythm_strategy": "single_note_rhythm",
+                 "pitch_strategy": "fixed_pitch3",
+                 "duration_strategy": "fixed_duration",
+                 }
+    strategy4 = {"rhythm_strategy": "single_note_rhythm",
+                 "pitch_strategy": "fixed_pitch4",
+                 "duration_strategy": "fixed_duration",
+                 }
+
+    np.random.seed()
+    for i in tqdm(range(n)):
+        path = path_name + f"/{i}"
+        Path(path).mkdir(parents=True, exist_ok=True)
+        parameter_values = [param.generate() for param in param_descriptions]
+        parameter_values_raw = {param.name: param.value for param in parameter_values}
+
+        # generate query music
+        midi_encode, midi = randomMidi(strategy)
+        signal_query = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+        write(path + f"/{i}.wav", synth.sample_rate, signal_query)
+        # plot_spectrogram(signal, path=path + f"/{i}_input.png", frame_length=512, frame_step=256)
+
+        if np.random.rand() < 0.01:  # positive pair: the reference keeps the query's parameters
+            with open(path + "/label.json", "w") as label_file:
+                sample = Sample(parameter_values)
+                label = sample.get_values()
+                label['manhattan_distance'] = 0.
+                json.dump(label, label_file, ensure_ascii=False)
+        else:
+            with open(path + "/label.json", "w") as label_file:
+                query_sp = parameter_values_raw
+                parameter_values = [param.generate() for param in param_descriptions]
+                parameter_values_raw = {param.name: param.value for param in parameter_values}
+                sample = Sample(parameter_values)
+                label = sample.get_values()
+                md = manhattan_distance(query_sp, parameter_values_raw)
+                label['manhattan_distance'] = md
+                json.dump(label, label_file, ensure_ascii=False)
+
+        # generate single-note reference audio (strategy0)
+        midi_encode, midi = randomMidi(strategy0)
+        signal_single_note = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+        write(path + f"/{i}_0.wav", synth.sample_rate, signal_single_note)
+
+        # generate single-note reference audio (strategy1)
+        midi_encode, midi = randomMidi(strategy1)
+        signal_single_note = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+        write(path + f"/{i}_1.wav", synth.sample_rate, signal_single_note)
+
+        # generate single-note reference audio (strategy2)
+        midi_encode, midi = randomMidi(strategy2)
+        signal_single_note = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+        write(path + f"/{i}_2.wav", synth.sample_rate, signal_single_note)
+
+        # generate single-note reference audio (strategy3)
+        midi_encode, midi = randomMidi(strategy3)
+        signal_single_note = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+        write(path + f"/{i}_3.wav", synth.sample_rate, signal_single_note)
+
+        # generate single-note reference audio (strategy4)
+        midi_encode, midi = randomMidi(strategy4)
+        signal_single_note = synth.get_melody(parameter_values_raw, midi=midi).numpy()
+        write(path + f"/{i}_4.wav", synth.sample_rate, signal_single_note)
+
+    print('Data generation done!')
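manhattan_distance compares two parameter dicts bin by bin: each parameter contributes its normalized bin-index distance (or 0/1 for osc_types), and the result is the average. A small worked example using bin values from conf.json above (the two dicts are illustrative):

from data_generation.data_generation import manhattan_distance

sp1 = {"attack": 0.001, "osc_types": 0}
sp2 = {"attack": 0.7, "osc_types": 1}
# attack falls into bins 0 and 5 of 6, so |0 - 5| / 5 = 1.0; osc_types differ, so 1.
print(manhattan_distance(sp1, sp2))  # 1.0, the average of [1.0, 1]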
data_generation/decoding.py
ADDED
@@ -0,0 +1,64 @@
+from typing import Dict
+from data_generation.data_generation import param_descriptions
+import numpy as np
+
+from melody_synth.melody_generator import MelodyGenerator
+from melody_synth.random_midi import RandomMidi
+
+
+def decode_label(prediction: np.ndarray,
+                 sample_rate: int,
+                 n_samples: int,
+                 return_params=False,
+                 discard_parameters=[]):
+    """Parses a network prediction array, synthesizes the described audio and returns it.
+
+    Parameters
+    ----------
+    prediction: np.ndarray
+        The network prediction array.
+    sample_rate: int
+        Sample rate of the audio to generate.
+    n_samples: int
+        Number of samples per wav file.
+    return_params: bool
+        Whether to also return the parameters alongside the signal.
+    discard_parameters: List[str]
+        Parameter names that should be discarded (set to their default value).
+
+    Returns
+    -------
+    np.ndarray:
+        The generated signal
+    """
+
+    params: Dict[str, float] = {}
+    index = 0
+    for i, param_description in enumerate(param_descriptions):
+        # Parses the one-hot-encoding of the prediction array
+        bits = len(param_description.values)
+        curr_prediction = prediction[index:index + bits]
+
+        hot_index = curr_prediction.argmax()
+        params[param_description.name] = param_description.parameter_value(hot_index).value
+        index += bits
+
+    for param_str in discard_parameters:
+        params[param_str] = 0  # todo: make this safe and change to default value and not just 0
+
+    synth = MelodyGenerator(sample_rate,
+                            n_samples, n_samples)
+    randomMidi = RandomMidi()
+
+    strategy = {"rhythm_strategy": "single_note_rhythm",
+                "pitch_strategy": "fixed_pitch",
+                "duration_strategy": "fixed_duration",
+                }
+    midi_encode, midi = randomMidi(strategy)
+
+    signal = synth.get_melody(params, midi=midi).numpy()
+
+    if return_params:
+        return signal, params
+
+    return signal
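decode_label expects the prediction to be the concatenated one-hot blocks in the same order as param_descriptions. A minimal round-trip sketch, assuming the melody_synth package and its dependencies are importable:

import numpy as np
from data_generation.data_generation import param_descriptions
from data_generation.decoding import decode_label

# Build a synthetic "prediction": one random one-hot block per parameter,
# mirroring the label layout produced by Sample.get_values().
prediction = np.hstack([p.generate().encoding for p in param_descriptions])
signal, params = decode_label(prediction, sample_rate=16384, n_samples=16384 * 4,
                              return_params=True)
print(params)  # the decoded parameter dict fed to the synthesizer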
data_generation/encoding.py
ADDED
@@ -0,0 +1,92 @@
+from typing import List, Dict
+import numpy as np
+
+
+def parameter_range_low_high(parameter_range: List):
+    """
+    :param parameter_range: List[Float]--midpoints of bins
+    :return: List[Float]--lower and upper bounds of bins
+    """
+    temp1 = np.array(parameter_range[1:])
+    temp2 = np.array(parameter_range[:len(temp1)])
+    temp1 = 0.5 * (temp1 + temp2)
+
+    return np.hstack([parameter_range[0], temp1, parameter_range[len(parameter_range) - 1]])
+
+
+class ParameterValue:
+    """Describes a one-hot encoded parameter value."""
+
+    name: str
+    value: float
+    encoding: List[float]
+    index: int
+
+    def __init__(self, name, value, encoding, index):
+        self.name = name
+        self.value = value
+        self.encoding = encoding
+        self.index = index
+
+
+class ParameterDescription:
+    """A description for generating a parameter value."""
+
+    # Discrete is used to generate samples that don't exactly fit into a bin for training.
+    def __init__(self, name, values: List[float], discrete=True):
+        self.name = name
+        self.values = values
+        self.discrete = discrete
+        self.parameter_low_high = parameter_range_low_high(values)
+
+    # one-hot encoding as per paper
+    # Value used for specifying a different value than values[index], useful for non-discrete params. todo: too adhoc?
+    def parameter_value(self, index, value=None) -> ParameterValue:
+        if value is None:
+            value = self.values[index]
+        encoding = np.zeros(len(self.values), dtype=float)
+        encoding[index] = 1.0
+        return ParameterValue(
+            name=self.name,
+            value=value,
+            encoding=encoding,
+            index=index
+        )
+
+    # random even distribution as per paper
+    def generate(self) -> ParameterValue:
+        # choose a bin if the parameter is discrete
+        if self.discrete:
+            index = np.random.randint(0, len(self.values))
+            return self.parameter_value(index)
+        # otherwise generate a random value inside a randomly chosen bin
+        else:
+            indexFinder = np.random.uniform(0, 1)
+            l = np.linspace(0.0, 1, len(self.values))
+            index = np.argmin(np.abs(l - indexFinder))
+            value = (self.parameter_low_high[index + 1] - self.parameter_low_high[index]) * np.random.uniform(0, 1) + self.parameter_low_high[index]
+
+            return self.parameter_value(index, value)
+
+    # get the index of the best matching bin
+    def get_bin_index(self, value):
+        return np.argmin(np.abs(np.array(self.values) - value))
+
+    def decode(self, encoding: List[float]) -> ParameterValue:
+        index = np.array(encoding).argmax()
+        return self.parameter_value(index)
+
+
+class Sample:
+    """Describes the label of one training sample."""
+
+    parameters: List[ParameterValue]
+
+    def __init__(self, parameters):
+        self.parameters = parameters
+
+    def get_values(self) -> Dict[str, dict]:
+        return {
+            "parameters": {p.name: p.value for p in self.parameters},
+            "encoding": list(np.hstack([p.encoding for p in self.parameters]))
+        }
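To see the continuous sampling at work: the bin midpoints are first turned into low/high bin edges, and generate() then draws uniformly inside a randomly chosen bin. A short sketch using the 'sustain' midpoints from conf.json:

from data_generation.encoding import ParameterDescription

pd = ParameterDescription(name="sustain", values=[0.01, 0.2, 0.5, 1.0], discrete=False)
print(pd.parameter_low_high)         # [0.01, 0.105, 0.35, 0.75, 1.0]
v = pd.generate()
print(v.index, v.value, v.encoding)  # e.g. 2, a value in [0.35, 0.75), [0., 0., 1., 0.]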
example.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
external sources.txt
ADDED
@@ -0,0 +1,3 @@
+1. External datasets have been preprocessed and stored in the directory '/data/external_data'. Links to the datasets can be found in the same directory.
+
+2. External code: some code in '/model/VAE.py', 'non_random_LFOs.py' and '/melody/complex_torch_synth.py' is adapted from external sources; the references are given in those files.
generate_synthetic_data_online.py
ADDED
@@ -0,0 +1,431 @@
1 |
+
import matplotlib.pyplot as plt
|
2 |
+
import librosa
|
3 |
+
import matplotlib
|
4 |
+
import pandas as pd
|
5 |
+
from typing import Optional
|
6 |
+
from torch import tensor
|
7 |
+
from ddsp.core import tf_float32
|
8 |
+
import torch
|
9 |
+
from torch import Tensor
|
10 |
+
import numpy as np
|
11 |
+
import tensorflow as tf
|
12 |
+
from torchsynth.config import SynthConfig
|
13 |
+
import ddsp
|
14 |
+
from pathlib import Path
|
15 |
+
from typing import Dict
|
16 |
+
from data_generation.encoding import ParameterDescription
|
17 |
+
from typing import List
|
18 |
+
from configurations.read_configuration import parameter_range, is_discrete, midi_parameter_range, midi_is_discrete
|
19 |
+
import shutil
|
20 |
+
from tqdm import tqdm
|
21 |
+
from scipy.io.wavfile import write
|
22 |
+
from melody_synth.complex_torch_synth import DoubleSawSynth, SinSawSynth, SinTriangleSynth, TriangleSawSynth
|
23 |
+
|
24 |
+
sample_rate = 16000
|
25 |
+
n_samples = sample_rate * 4.5
|
26 |
+
|
27 |
+
|
28 |
+
class NoteGenerator:
|
29 |
+
"""
|
30 |
+
This class is responsible for single-note audio generation by function 'get_note'.
|
31 |
+
"""
|
32 |
+
|
33 |
+
def __init__(self,
|
34 |
+
sample_rate=sample_rate,
|
35 |
+
n_samples=sample_rate * 4.5):
|
36 |
+
self.sample_rate = sample_rate
|
37 |
+
self.n_samples = n_samples
|
38 |
+
synthconfig = SynthConfig(
|
39 |
+
batch_size=1, reproducible=False, sample_rate=sample_rate,
|
40 |
+
buffer_size_seconds=np.float64(n_samples) / np.float64(sample_rate)
|
41 |
+
)
|
42 |
+
self.Saw_Square_Voice = DoubleSawSynth(synthconfig)
|
43 |
+
self.SinSawVoice = SinSawSynth(synthconfig)
|
44 |
+
self.SinTriVoice = SinTriangleSynth(synthconfig)
|
45 |
+
self.TriSawVoice = TriangleSawSynth(synthconfig)
|
46 |
+
|
47 |
+
def get_note(self, params: Dict[str, float]):
|
48 |
+
osc_amp2 = np.float64(params.get("osc_amp2", 0))
|
49 |
+
|
50 |
+
if osc_amp2 < 0.45:
|
51 |
+
osc1_amp = 0.9
|
52 |
+
osc2_amp = osc_amp2
|
53 |
+
else:
|
54 |
+
osc1_amp = 0.9 - osc_amp2
|
55 |
+
osc2_amp = 0.9
|
56 |
+
|
57 |
+
attack_1 = np.float64(params.get("attack_1", 0))
|
58 |
+
decay_1 = np.float64(params.get("decay_1", 0))
|
59 |
+
sustain_1 = np.float64(params.get("sustain_1", 0))
|
60 |
+
release_1 = np.float64(params.get("release_1", 0))
|
61 |
+
|
62 |
+
attack_2 = np.float64(params.get("attack_2", 0))
|
63 |
+
decay_2 = np.float64(params.get("decay_2", 0))
|
64 |
+
sustain_2 = np.float64(params.get("sustain_2", 0))
|
65 |
+
release_2 = np.float64(params.get("release_2", 0))
|
66 |
+
|
67 |
+
amp_mod_freq = params.get("amp_mod_freq", 0)
|
68 |
+
amp_mod_depth = params.get("amp_mod_depth", 0)
|
69 |
+
amp_mod_waveform = params.get("amp_mod_waveform", 0)
|
70 |
+
|
71 |
+
pitch_mod_freq_1 = params.get("pitch_mod_freq_1", 0)
|
72 |
+
pitch_mod_depth = params.get("pitch_mod_depth", 0)
|
73 |
+
|
74 |
+
cutoff_freq = params.get("cutoff_freq", 4000)
|
75 |
+
|
76 |
+
pitch = np.float64(params.get("pitch", 0))
|
77 |
+
duration = np.float64(params.get("duration", 0))
|
78 |
+
|
79 |
+
syn_parameters = {
|
80 |
+
("adsr_1", "attack"): tensor([attack_1]), # [0.0, 2.0]
|
81 |
+
("adsr_1", "decay"): tensor([decay_1]), # [0.0, 2.0]
|
82 |
+
("adsr_1", "sustain"): tensor([sustain_1]), # [0.0, 2.0]
|
83 |
+
("adsr_1", "release"): tensor([release_1]), # [0.0, 2.0]
|
84 |
+
("adsr_1", "alpha"): tensor([5]), # [0.1, 6.0]
|
85 |
+
|
86 |
+
("adsr_2", "attack"): tensor([attack_2]), # [0.0, 2.0]
|
87 |
+
("adsr_2", "decay"): tensor([decay_2]), # [0.0, 2.0]
|
88 |
+
("adsr_2", "sustain"): tensor([sustain_2]), # [0.0, 2.0]
|
89 |
+
("adsr_2", "release"): tensor([release_2]), # [0.0, 2.0]
|
90 |
+
("adsr_2", "alpha"): tensor([5]), # [0.1, 6.0]
|
91 |
+
("keyboard", "midi_f0"): tensor([pitch]),
|
92 |
+
("keyboard", "duration"): tensor([duration]),
|
93 |
+
|
94 |
+
# Mixer parameter
|
95 |
+
("mixer", "vco_1"): tensor([osc1_amp]), # [0, 1]
|
96 |
+
("mixer", "vco_2"): tensor([osc2_amp]), # [0, 1]
|
97 |
+
|
98 |
+
# Constant parameters:
|
99 |
+
("vco_1", "mod_depth"): tensor([pitch_mod_depth]), # [-96, 96]
|
100 |
+
("vco_1", "tuning"): tensor([0.0]), # [-24.0, 24]
|
101 |
+
("vco_2", "mod_depth"): tensor([pitch_mod_depth]), # [-96, 96]
|
102 |
+
("vco_2", "tuning"): tensor([0.0]), # [-24.0, 24]
|
103 |
+
|
104 |
+
# LFOs
|
105 |
+
("lfo_amp_sin", "frequency"): tensor([amp_mod_freq]), # [0, 20]
|
106 |
+
("lfo_amp_sin", "mod_depth"): tensor([0]), # [-10, 20]
|
107 |
+
("lfo_pitch_sin_1", "frequency"): tensor([pitch_mod_freq_1]), # [0, 20]
|
108 |
+
("lfo_pitch_sin_1", "mod_depth"): tensor([10]), # [-10, 20]
|
109 |
+
("lfo_pitch_sin_2", "frequency"): tensor([pitch_mod_freq_1]), # [0, 20]
|
110 |
+
("lfo_pitch_sin_2", "mod_depth"): tensor([10]), # [-10, 20]
|
111 |
+
}
|
112 |
+
|
113 |
+
osc_types = params.get("osc_types", 0)
|
114 |
+
if osc_types == 0:
|
115 |
+
synth = self.SinSawVoice
|
116 |
+
syn_parameters[("vco_2", "shape")] = tensor([1])
|
117 |
+
elif osc_types == 1:
|
118 |
+
synth = self.SinSawVoice
|
119 |
+
syn_parameters[("vco_2", "shape")] = tensor([0])
|
120 |
+
elif osc_types == 2:
|
121 |
+
synth = self.Saw_Square_Voice
|
122 |
+
syn_parameters[("vco_1", "shape")] = tensor([1])
|
123 |
+
syn_parameters[("vco_2", "shape")] = tensor([0])
|
124 |
+
elif osc_types == 3:
|
125 |
+
synth = self.SinTriVoice
|
126 |
+
elif osc_types == 4:
|
127 |
+
synth = self.TriSawVoice
|
128 |
+
syn_parameters[("vco_2", "shape")] = tensor([1])
|
129 |
+
else:
|
130 |
+
synth = self.TriSawVoice
|
131 |
+
syn_parameters[("vco_2", "shape")] = tensor([0])
|
132 |
+
|
133 |
+
synth.set_parameters(syn_parameters)
|
134 |
+
audio_out = synth.get_signal(amp_mod_depth, amp_mod_waveform, int(sample_rate * duration), osc1_amp, osc2_amp)
|
135 |
+
single_note = audio_out[0].detach().numpy()
|
136 |
+
|
137 |
+
cutoff_freq = tf_float32(cutoff_freq)
|
138 |
+
impulse_response = ddsp.core.sinc_impulse_response(cutoff_freq, 2048, self.sample_rate)
|
139 |
+
single_note = tf_float32(single_note)
|
140 |
+
return ddsp.core.fft_convolve(single_note[tf.newaxis, :], impulse_response)[0, :]
|
141 |
+
|
142 |
+
|
143 |
+
class MelodyGenerator:
|
144 |
+
"""
|
145 |
+
This class is responsible for multi-note audio generation by function 'get_melody'.
|
146 |
+
"""
|
147 |
+
|
148 |
+
def __init__(self,
|
149 |
+
sample_rate=sample_rate,
|
150 |
+
n_note_samples=sample_rate * 4.5,
|
151 |
+
n_melody_samples=sample_rate * 4.5):
|
152 |
+
self.sample_rate = sample_rate
|
153 |
+
self.noteGenerator = NoteGenerator(sample_rate, sample_rate * 4.5)
|
154 |
+
self.n_melody_samples = int(n_melody_samples)
|
155 |
+
|
156 |
+
def get_melody(self, params_list: List[Dict[str, float]], onsets):
|
157 |
+
track = np.zeros(self.n_melody_samples)
|
158 |
+
for i in range(len(onsets)):
|
159 |
+
location = onsets[i]
|
160 |
+
single_note = self.noteGenerator.get_note(params_list[i])
|
161 |
+
single_note = np.hstack(
|
162 |
+
[np.zeros(int(location)), single_note, np.zeros(self.n_melody_samples)])[
|
163 |
+
:self.n_melody_samples]
|
164 |
+
track = track + single_note
|
165 |
+
return track
|
166 |
+
|
167 |
+
|
168 |
+
def plot_log_spectrogram(signal: np.ndarray,
|
169 |
+
path: str,
|
170 |
+
n_fft=2048,
|
171 |
+
frame_length=1024,
|
172 |
+
frame_step=256):
|
173 |
+
"""Write spectrogram."""
|
174 |
+
stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
|
175 |
+
amp = np.square(np.real(stft)) + np.square(np.imag(stft))
|
176 |
+
magnitude_spectrum = np.abs(amp)
|
177 |
+
log_mel = np_power_to_db(magnitude_spectrum)
|
178 |
+
matplotlib.pyplot.imsave(path, log_mel, vmin=-100, vmax=0, origin='lower')
|
179 |
+
|
180 |
+
|
181 |
+
def np_power_to_db(S, amin=1e-16, top_db=80.0):
|
182 |
+
"""A helper function for scaling."""
|
183 |
+
|
184 |
+
def np_log10(x):
|
185 |
+
numerator = np.log(x)
|
186 |
+
denominator = np.log(10)
|
187 |
+
return numerator / denominator
|
188 |
+
|
189 |
+
# Scale magnitude relative to maximum value in S. Zeros in the output
|
190 |
+
# correspond to positions where S == ref.
|
191 |
+
ref = np.max(S)
|
192 |
+
|
193 |
+
# 每个元素取max
|
194 |
+
log_spec = 10.0 * np_log10(np.maximum(amin, S))
|
195 |
+
log_spec -= 10.0 * np_log10(np.maximum(amin, ref))
|
196 |
+
|
197 |
+
log_spec = np.maximum(log_spec, np.max(log_spec) - top_db)
|
198 |
+
|
199 |
+
return log_spec
|
200 |
+
|
201 |
+
|
202 |
+
synth = MelodyGenerator()

param_descriptions: List[ParameterDescription] = [

    # Oscillator levels
    ParameterDescription(name="osc_amp2",
                         values=parameter_range('osc_amp2'),
                         discrete=is_discrete('osc_amp2')),

    # ADSR parameters
    ParameterDescription(name="attack_1",
                         values=parameter_range('attack'),
                         discrete=is_discrete('attack')),
    ParameterDescription(name="decay_1",
                         values=parameter_range('decay'),
                         discrete=is_discrete('decay')),
    ParameterDescription(name="sustain_1",
                         values=parameter_range('sustain'),
                         discrete=is_discrete('sustain')),
    ParameterDescription(name="release_1",
                         values=parameter_range('release'),
                         discrete=is_discrete('release')),
    ParameterDescription(name="attack_2",
                         values=parameter_range('attack'),
                         discrete=is_discrete('attack')),
    ParameterDescription(name="decay_2",
                         values=parameter_range('decay'),
                         discrete=is_discrete('decay')),
    ParameterDescription(name="sustain_2",
                         values=parameter_range('sustain'),
                         discrete=is_discrete('sustain')),
    ParameterDescription(name="release_2",
                         values=parameter_range('release'),
                         discrete=is_discrete('release')),

    ParameterDescription(name="cutoff_freq",
                         values=parameter_range('cutoff_freq'),
                         discrete=is_discrete('cutoff_freq')),
    ParameterDescription(name="pitch",
                         values=midi_parameter_range('pitch'),
                         discrete=midi_is_discrete('pitch')),
    ParameterDescription(name="duration",
                         values=midi_parameter_range('duration'),
                         discrete=midi_is_discrete('duration')),

    ParameterDescription(name="amp_mod_freq",
                         values=parameter_range('amp_mod_freq'),
                         discrete=is_discrete('amp_mod_freq')),
    ParameterDescription(name="amp_mod_depth",
                         values=parameter_range('amp_mod_depth'),
                         discrete=is_discrete('amp_mod_depth')),

    ParameterDescription(name="pitch_mod_freq_1",
                         values=parameter_range('pitch_mod_freq'),
                         discrete=is_discrete('pitch_mod_freq')),
    ParameterDescription(name="pitch_mod_freq_2",
                         values=parameter_range('pitch_mod_freq'),
                         discrete=is_discrete('pitch_mod_freq')),
    ParameterDescription(name="pitch_mod_depth",
                         values=parameter_range('pitch_mod_depth'),
                         discrete=is_discrete('pitch_mod_depth')),

    # Oscillator types:
    # 0 for sin saw, 1 for sin square, 2 for saw square,
    # 3 for sin triangle, 4 for triangle saw, 5 for triangle square
    ParameterDescription(name="osc_types",
                         values=parameter_range('osc_types'),
                         discrete=is_discrete('osc_types')),
]

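
# Illustration only (not part of the original upload): drawing one random synth
# configuration from the descriptions above, the same pattern the generators below use.
def _demo_random_parameter_set():
    parameter_values = [param.generate() for param in param_descriptions]
    return {param.name: param.value for param in parameter_values}
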
frame_length = 1024
frame_step = 256
spectrogram_len = 256

n_fft = 1024


def generate_synth_dataset_log_muted_512(n: int, path_name="./data/data_log", write_spec=False):
    """Generate the synthetic dataset without a progress bar."""
    if Path(path_name).exists():
        shutil.rmtree(path_name)

    Path(path_name).mkdir(parents=True, exist_ok=True)
    print("Generating dataset...")

    synthetic_data = np.ones((n, 512, 256))

    for i in range(n):
        parameter_values = [param.generate() for param in param_descriptions]
        parameter_values_raw = {param.name: param.value for param in parameter_values}
        parameter_values_raw["duration"] = 3.0
        parameter_values_raw["pitch"] = 52
        parameter_values_raw["pitch_mod_depth"] = 0.0
        signal = synth.get_melody([parameter_values_raw], [0])
        stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
        amp = np.square(np.real(stft)) + np.square(np.imag(stft))

        synthetic_data[i] = amp[:512, :256]

        if write_spec:
            write(path_name + f"/{i}.wav", synth.sample_rate, signal)
            plot_log_spectrogram(signal, path=path_name + f"/{i}.png", frame_length=frame_length, frame_step=frame_step)
    print(f"Generating dataset over, {n} samples generated!")
    return synthetic_data


def generate_synth_dataset_log_512(n: int, path_name="./data/data_log", write_spec=False):
    """Generate the synthetic dataset with a progress bar."""
    if Path(path_name).exists():
        shutil.rmtree(path_name)

    Path(path_name).mkdir(parents=True, exist_ok=True)
    print("Generating dataset...")

    synthetic_data = np.ones((n, 512, 256))

    for i in tqdm(range(n)):
        parameter_values = [param.generate() for param in param_descriptions]
        parameter_values_raw = {param.name: param.value for param in parameter_values}
        parameter_values_raw["duration"] = 3.0
        parameter_values_raw["pitch"] = 52
        parameter_values_raw["pitch_mod_depth"] = 0.0
        signal = synth.get_melody([parameter_values_raw], [0])
        stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
        amp = np.square(np.real(stft)) + np.square(np.imag(stft))

        synthetic_data[i] = amp[:512, :256]

        if write_spec:
            write(path_name + f"/{i}.wav", synth.sample_rate, signal)
            plot_log_spectrogram(signal, path=path_name + f"/{i}.png", frame_length=frame_length, frame_step=frame_step)
    print(f"Generating dataset over, {n} samples generated!")
    return synthetic_data


def generate_DANN_dataset_muted(n: int, path_name="./data/data_DANN", write_spec=False):
    """Generate the paired multi-note / single-note dataset without a progress bar."""
    if Path(path_name).exists():
        shutil.rmtree(path_name)

    Path(path_name).mkdir(parents=True, exist_ok=True)
    print("Generating dataset...")

    multinote_data = np.ones((n, 512, 256))
    single_data = np.ones((n, 512, 256))
    for i in range(n):
        par_list = []
        n_notes = np.random.randint(1, 5)
        onsets = []
        for j in range(n_notes):
            parameter_values = [param.generate() for param in param_descriptions]
            parameter_values_raw = {param.name: param.value for param in parameter_values}
            parameter_values_raw["pitch_mod_depth"] = 0.0
            par_list.append(parameter_values_raw)
            onsets.append(np.random.randint(0, sample_rate * 3))

        signal = synth.get_melody(par_list, onsets)
        stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
        amp = np.square(np.real(stft)) + np.square(np.imag(stft))
        multinote_data[i] = amp[:512, :256]
        if write_spec:
            write(path_name + f"/mul_{i}.wav", synth.sample_rate, signal)
            plot_log_spectrogram(signal, path=path_name + f"/mul_{i}.png", frame_length=frame_length,
                                 frame_step=frame_step)

        # The single-note domain re-renders the earliest note of the melody alone.
        single_par = par_list[np.argmin(onsets)]
        single_par["duration"] = 3.0
        single_par["pitch"] = 52
        signal = synth.get_melody([single_par], [0])
        stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
        amp = np.square(np.real(stft)) + np.square(np.imag(stft))
        single_data[i] = amp[:512, :256]
        if write_spec:
            # Distinct file name so the single-note render does not overwrite the melody.
            write(path_name + f"/single_{i}.wav", synth.sample_rate, signal)
            plot_log_spectrogram(signal, path=path_name + f"/single_{i}.png", frame_length=frame_length,
                                 frame_step=frame_step)
    print(f"Generating dataset over, {n} samples generated!")
    return multinote_data, single_data


def generate_DANN_dataset(n: int, path_name="./data/data_DANN", write_spec=False):
    """Generate the paired multi-note / single-note dataset for adversarial (DANN) training, with a progress bar."""
    if Path(path_name).exists():
        shutil.rmtree(path_name)

    Path(path_name).mkdir(parents=True, exist_ok=True)
    print("Generating dataset...")

    multinote_data = np.ones((n, 512, 256))
    single_data = np.ones((n, 512, 256))
    for i in tqdm(range(n)):
        par_list = []
        n_notes = np.random.randint(1, 5)
        onsets = []
        for j in range(n_notes):
            parameter_values = [param.generate() for param in param_descriptions]
            parameter_values_raw = {param.name: param.value for param in parameter_values}
            parameter_values_raw["pitch_mod_depth"] = 0.0
            par_list.append(parameter_values_raw)
            onsets.append(np.random.randint(0, sample_rate * 3))

        signal = synth.get_melody(par_list, onsets)
        stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
        amp = np.square(np.real(stft)) + np.square(np.imag(stft))
        multinote_data[i] = amp[:512, :256]
        if write_spec:
            write(path_name + f"/mul_{i}.wav", synth.sample_rate, signal)
            plot_log_spectrogram(signal, path=path_name + f"/mul_{i}.png", frame_length=frame_length,
                                 frame_step=frame_step)

        # The single-note domain re-renders the earliest note of the melody alone.
        single_par = par_list[np.argmin(onsets)]
        single_par["duration"] = 3.0
        single_par["pitch"] = 52
        signal = synth.get_melody([single_par], [0])
        stft = librosa.stft(signal, n_fft=1024, hop_length=256, win_length=1024)
        amp = np.square(np.real(stft)) + np.square(np.imag(stft))
        single_data[i] = amp[:512, :256]
        if write_spec:
            # Distinct file name so the single-note render does not overwrite the melody.
            write(path_name + f"/single_{i}.wav", synth.sample_rate, signal)
            plot_log_spectrogram(signal, path=path_name + f"/single_{i}.png", frame_length=frame_length,
                                 frame_step=frame_step)
    print(f"Generating dataset over, {n} samples generated!")
    return multinote_data, single_data
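A minimal usage sketch for the generators above (the sample count and path are arbitrary assumptions):

from generate_synthetic_data_online import generate_synth_dataset_log_512

specs = generate_synth_dataset_log_512(16, path_name="./data/demo", write_spec=False)
print(specs.shape)  # (16, 512, 256) power spectrograms, one fixed note per sample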
load_data.py
ADDED
@@ -0,0 +1,150 @@
import joblib
import numpy as np
from generate_synthetic_data_online import generate_synth_dataset_log_512, generate_synth_dataset_log_muted_512
from tools import show_spc, spc_to_VAE_input, VAE_out_put_to_spc, np_log10
import torch.utils.data as data


class Data_cache:
    """Caches the synthetic dataset together with the external sources."""

    def __init__(self, synthetic_data, external_sources):
        self.n_synthetic = np.shape(synthetic_data)[0]
        self.synthetic_data = synthetic_data.astype(np.float32)
        self.external_sources = external_sources.astype(np.float32)
        self.epsilon = 1e-20

    def get_all_data(self):
        return np.vstack([self.synthetic_data, self.external_sources])

    def refresh(self):
        """Re-generate the synthetic part of the cache."""
        self.synthetic_data = generate_synth_dataset(self.n_synthetic, mute=True)

    def get_data_loader(self, shuffle=True, BATCH_SIZE=8, new_way=False):
        all_data = self.get_all_data()
        our_data = []
        for i in range(len(all_data)):
            if new_way:
                spectrogram = VAE_out_put_to_spc(np.reshape(all_data[i], (1, 512, 256)))
                log_spectrogram = np.log10(spectrogram + self.epsilon)
                our_data.append(log_spectrogram)
            else:
                our_data.append(np.reshape(all_data[i], (1, 512, 256)))

        iterator = data.DataLoader(our_data, shuffle=shuffle, batch_size=BATCH_SIZE)
        return iterator


def generate_synth_dataset(n_synthetic, mute=False):
    """Generate and preprocess the synthetic data."""
    if mute:
        Input0 = generate_synth_dataset_log_muted_512(n_synthetic)
    else:
        Input0 = generate_synth_dataset_log_512(n_synthetic)
    Input0 = spc_to_VAE_input(Input0)
    Input0 = Input0.reshape(Input0.shape[0], Input0.shape[1], Input0.shape[2], 1)
    return Input0


def read_data(data_path):
    """Read one external-source dataset from disk."""
    data = np.array(joblib.load(data_path))
    data = spc_to_VAE_input(data)
    data = data.reshape(data.shape[0], data.shape[1], data.shape[2], 1)
    return data


def load_data(n_synthetic):
    """Assemble the hybrid dataset (synthetic data plus external sources)."""
    Input_synthetic = generate_synth_dataset(n_synthetic)

    Input_AU = read_data("./data/external_data/ARTURIA_data")
    print("ARTURIA dataset loaded.")

    Input_NSynth = read_data("./data/external_data/NSynth_data")
    print("NSynth dataset loaded.")

    # The SoundFonts spectrograms are 251 frames wide; zero-pad them to 256.
    Input_SF = read_data("./data/external_data/soundfonts_data")
    Input_SF_256 = np.zeros((337, 512, 256, 1))
    Input_SF_256[:, :, :251, :] += Input_SF
    Input_SF = Input_SF_256
    print("SoundFonts dataset loaded.")

    Input_google = read_data("./data/external_data/WaveNet_samples")

    Input_external = np.vstack([Input_AU, Input_NSynth, Input_SF, Input_google])
    data_cache = Data_cache(Input_synthetic, Input_external)
    print(f"Data loaded, data shape: {np.shape(data_cache.get_all_data())}")
    return data_cache


def show_data(dataset_name, n_sample=3, index=-1, new_way=False):
    """Show and return a certain dataset.

    Parameters
    ----------
    dataset_name: str
        Name of the dataset to show.
    n_sample: int
        Number of samples to show.
    index: int
        Setting 'index' greater than or equal to 0 shows the 'index'-th sample of the chosen dataset.

    Returns
    -------
    np.ndarray:
        The dataset that was shown.
    """

    if dataset_name == "ARTURIA":
        data = read_data("./data/external_data/ARTURIA_data")
    elif dataset_name == "NSynth":
        data = read_data("./data/external_data/NSynth_data")
    elif dataset_name == "SoundFonts":
        data = read_data("./data/external_data/soundfonts_data")
    elif dataset_name == "Synthetic":
        data = generate_synth_dataset(int(n_sample * 3))
    else:
        print("Example command: \"!python thesis_main.py show_data -s [ARTURIA, NSynth, SoundFonts, Synthetic] -n 5\"")
        return

    if index >= 0:
        show_spc(VAE_out_put_to_spc(data[index]))
    else:
        for i in range(n_sample):
            index = np.random.randint(0, len(data))
            print(index)
            show_spc(VAE_out_put_to_spc(data[index]))
    return data


# Note: this was also named `show_data` in the upload, shadowing the function above.
def show_data_batch(tensor_batch, index=-1, new_way=False):
    """Show one sample from a tensor batch."""
    if index < 0:
        index = np.random.randint(0, tensor_batch.shape[0])

    if new_way:
        sample = tensor_batch[index].detach().numpy()
        spectrogram = 10.0 ** sample
        print(f"The {index}-th sample:")
        show_spc(spectrogram)
    else:
        sample = tensor_batch[index].detach().numpy()
        show_spc(VAE_out_put_to_spc(sample))
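A short usage sketch for the cache above; given the `data_cache` returned by `load_data`, the loader yields VAE-ready batches (batch size 8 is just an example):

loader = data_cache.get_data_loader(shuffle=True, BATCH_SIZE=8)
for batch in loader:
    print(batch.shape)  # torch.Size([8, 1, 512, 256])
    break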
melody_synth/complex_torch_synth.py
ADDED
@@ -0,0 +1,221 @@
from typing import Optional

import numpy as np
import torch
from torch import Tensor, tensor

from torchsynth.config import SynthConfig
from torchsynth.module import (
    ADSR,
    VCA,
    AudioMixer,
    ControlRateUpsample,
    MonophonicKeyboard,
    SineVCO,
    SquareSawVCO,
    VCO, LFO, ModulationMixer,
)
from torchsynth.signal import Signal
from torchsynth.synth import AbstractSynth

# from configurations.read_configuration import get_conf_sample_rate
from melody_synth.non_random_LFOs import SinLFO, SawLFO, TriLFO, SquareLFO, RSawLFO


class TriangleVCO(VCO):
    """A VCO subclass that produces triangle waves."""

    def oscillator(self, argument: Signal, midi_f0: Tensor) -> Signal:
        return torch.arcsin(torch.sin(argument * 2)) * 2.0 / torch.pi


class AmpModTorchSynth(AbstractSynth):
    """An abstract class that assembles the synthesizers generating the training set
    from torchsynth modules. (The implementation of this class references code in TorchSynth.)"""

    def __init__(
            self,
            synthconfig: Optional[SynthConfig] = None,
            nebula: Optional[str] = "nebula",
            *args,
            **kwargs,
    ):
        AbstractSynth.__init__(self, synthconfig=synthconfig, *args, **kwargs)
        self.share_modules = [
            ("keyboard", MonophonicKeyboard),
            ("adsr_1", ADSR),
            ("adsr_2", ADSR),
            ("upsample", ControlRateUpsample),
            ("vca", VCA),
            ("lfo_amp_sin", SinLFO),
            ("lfo_pitch_sin_1", SinLFO),
            ("lfo_pitch_sin_2", SinLFO),
            (
                "mixer",
                AudioMixer,
                {
                    "n_input": 2,
                    "curves": [1.0, 1.0],
                    "names": ["vco_1", "vco_2"],
                },
            )
        ]

    def output(self) -> Tensor:
        """Synthesizes the signal as a Tensor."""

        midi_f0, note_on_duration = self.keyboard()
        adsr1 = self.adsr_1(note_on_duration)
        adsr1 = self.upsample(adsr1)

        adsr2 = self.adsr_2(note_on_duration)
        adsr2 = self.upsample(adsr2)

        amp_modulation = self.lfo_amp_sin()
        amp_modulation = self.upsample(amp_modulation)

        pitch_modulation_1 = self.lfo_pitch_sin_1()
        pitch_modulation_1 = self.upsample(pitch_modulation_1)
        pitch_modulation_2 = self.lfo_pitch_sin_2()
        pitch_modulation_2 = self.upsample(pitch_modulation_2)

        vco_amp1 = adsr1 * (amp_modulation * 0.5 + 0.5)
        vco_amp2 = adsr2 * (amp_modulation * 0.5 + 0.5)
        vco_1_out = self.vco_1(midi_f0, pitch_modulation_1)
        vco_1_out = self.vca(vco_1_out, vco_amp1)

        vco_2_out = self.vco_2(midi_f0, pitch_modulation_2)
        vco_2_out = self.vca(vco_2_out, vco_amp2)

        return self.mixer(vco_1_out, vco_2_out)

    def get_signal(self, amp_mod_depth, amp_waveform, duration_l, amp1, amp2):
        """Synthesizes the signal as a Tensor."""

        midi_f0, note_on_duration = self.keyboard()
        adsr1 = self.adsr_1(note_on_duration)
        adsr1 = self.upsample(adsr1)

        adsr2 = self.adsr_2(note_on_duration)
        adsr2 = self.upsample(adsr2)

        amp_modulation = self.lfo_amp_sin()
        amp_modulation = self.upsample(amp_modulation)

        pitch_modulation_1 = self.lfo_pitch_sin_1()
        pitch_modulation_1 = self.upsample(pitch_modulation_1)
        pitch_modulation_2 = self.lfo_pitch_sin_2()
        pitch_modulation_2 = self.upsample(pitch_modulation_2)

        vco_amp1 = adsr1 * (amp_modulation * 0.5 + 0.5)
        vco_amp2 = adsr2 * (amp_modulation * 0.5 + 0.5)
        vco_1_out = self.vco_1(midi_f0, pitch_modulation_1)
        vco_1_out = self.vca(vco_1_out, vco_amp1)

        # Use the second pitch LFO for the second VCO, mirroring output() above.
        vco_2_out = self.vco_2(midi_f0, pitch_modulation_2)
        vco_2_out = self.vca(vco_2_out, vco_amp2)

        return self.mixer(vco_1_out, vco_2_out)


class DoubleSawSynth(AmpModTorchSynth):
    """In addition to the shared modules, this synthesizer uses two "SquareSawVCO" modules to generate square and
    sawtooth waves."""

    def __init__(
            self,
            synthconfig: Optional[SynthConfig] = None,
            nebula: Optional[str] = "saw_square_voice",
            *args,
            **kwargs,
    ):
        AmpModTorchSynth.__init__(self, synthconfig=synthconfig, *args, **kwargs)

        # Register all modules as children
        module_list = self.share_modules
        module_list.append(("vco_1", SquareSawVCO))
        module_list.append(("vco_2", SquareSawVCO))
        self.add_synth_modules(module_list)


class SinSawSynth(AmpModTorchSynth):
    """In addition to the shared modules, this synthesizer uses a "SineVCO" and a "SquareSawVCO" to generate
    sine and sawtooth/square waves."""

    def __init__(
            self,
            synthconfig: Optional[SynthConfig] = None,
            nebula: Optional[str] = "sin_saw_voice",
            *args,
            **kwargs,
    ):
        AmpModTorchSynth.__init__(self, synthconfig=synthconfig, *args, **kwargs)

        # Register all modules as children
        module_list = self.share_modules
        module_list.append(("vco_1", SineVCO))
        module_list.append(("vco_2", SquareSawVCO))
        self.add_synth_modules(module_list)


class SinTriangleSynth(AmpModTorchSynth):
    """In addition to the shared modules, this synthesizer uses a "SineVCO" and a "TriangleVCO" to generate
    sine and triangle waves."""

    def __init__(
            self,
            synthconfig: Optional[SynthConfig] = None,
            nebula: Optional[str] = "sin_tri_voice",
            *args,
            **kwargs,
    ):
        AmpModTorchSynth.__init__(self, synthconfig=synthconfig, *args, **kwargs)

        # Register all modules as children
        module_list = self.share_modules
        module_list.append(("vco_1", SineVCO))
        module_list.append(("vco_2", TriangleVCO))
        self.add_synth_modules(module_list)


class TriangleSawSynth(AmpModTorchSynth):
    """In addition to the shared modules, this synthesizer uses a "TriangleVCO" and a "SquareSawVCO" to generate
    triangle and sawtooth/square waves."""

    def __init__(
            self,
            synthconfig: Optional[SynthConfig] = None,
            nebula: Optional[str] = "triangle_saw_voice",
            *args,
            **kwargs,
    ):
        AmpModTorchSynth.__init__(self, synthconfig=synthconfig, *args, **kwargs)

        # Register all modules as children
        module_list = self.share_modules
        module_list.append(("vco_1", TriangleVCO))
        module_list.append(("vco_2", SquareSawVCO))
        self.add_synth_modules(module_list)


def amp_mod_with_duration(env, duration_l):
    """Find the last local maximum of the envelope within `duration_l` samples and rescale the tail after it."""
    env_np = env.detach().numpy()[0] + 1e-30
    env_np_shift = np.hstack([[0], env_np[:-1]])
    env_np_sign = (env_np - env_np_shift)[:duration_l] + 1e-30

    env_np_sign_nor = np.around(env_np_sign / np.abs(env_np_sign))
    env_np_sign_nor_shift = np.hstack([[0], env_np_sign_nor[:-1]])
    extreme_points = (env_np_sign_nor - env_np_sign_nor_shift)

    # A +1 -> -1 sign flip (difference of -2) marks a local maximum.
    (max_loc,) = np.where(extreme_points == -2)

    n_max = len(max_loc)
    if n_max == 0:
        return env
    else:
        last_max_loc = max_loc[n_max - 1] - 1
        # new_env = np.hstack([env_np[:last_max_loc], np.ones(len(env_np) - last_max_loc) * env_np[last_max_loc]])
        new_env = np.hstack([env_np[:last_max_loc], (env_np[last_max_loc:] * 0.8 + 0.2)])

        return tensor([new_env])
melody_synth/melody_generator.py
ADDED
@@ -0,0 +1,121 @@
from typing import Dict
import torch
from ddsp.core import tf_float32
import tensorflow as tf
import ddsp
import numpy as np
from torch import tensor
from melody_synth.complex_torch_synth import SinSawSynth, DoubleSawSynth, TriangleSawSynth, SinTriangleSynth
from torchsynth.config import SynthConfig

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


class MelodyGenerator:
    """This is the only external interface of the melody_synth package."""

    def __init__(self,
                 sample_rate: int,
                 n_note_samples: int,
                 n_melody_samples: int):
        self.sample_rate = sample_rate
        self.n_note_samples = n_note_samples
        self.n_melody_samples = n_melody_samples
        synthconfig = SynthConfig(
            batch_size=1, reproducible=False, sample_rate=sample_rate,
            buffer_size_seconds=np.float64(n_note_samples) / np.float64(sample_rate)
        )
        self.Saw_Square_Voice = DoubleSawSynth(synthconfig)
        self.SinSawVoice = SinSawSynth(synthconfig)
        self.SinTriVoice = SinTriangleSynth(synthconfig)
        self.TriSawVoice = TriangleSawSynth(synthconfig)

    def get_melody(self, params: Dict[str, float], midi) -> tf.Tensor:
        """Generates the melody audio.

        Parameters
        ----------
        params: Dict[str, float]
            Dictionary of specifications (see Readme).
        midi: List[(float, float, float)]
            Melody midi as (onset, pitch, duration) tuples (see Readme).

        Returns
        -------
        audio: tf.Tensor
            The rendered melody.
        """

        # Built-in float (np.float has been removed from NumPy).
        osc1_amp = float(params.get("osc1_amp", 0))
        osc2_amp = float(params.get("osc2_amp", 0))
        attack = float(params.get("attack", 0))
        decay = float(params.get("decay", 0))
        sustain = float(params.get("sustain", 0))
        release = float(params.get("release", 0))
        cutoff_freq = params.get("cutoff_freq", 4000)

        syn_parameters = {
            ("adsr", "attack"): tensor([attack]),  # [0.0, 2.0]
            ("adsr", "decay"): tensor([decay]),  # [0.0, 2.0]
            ("adsr", "sustain"): tensor([sustain]),  # [0.0, 2.0]
            ("adsr", "release"): tensor([release]),  # [0.0, 2.0]
            ("adsr", "alpha"): tensor([3]),  # [0.1, 6.0]

            # Mixer parameters
            ("mixer", "vco_1"): tensor([osc1_amp]),  # [0, 1]
            ("mixer", "vco_2"): tensor([osc2_amp]),  # [0, 1]

            # Constant parameters:
            ("vco_1", "mod_depth"): tensor([0.0]),  # [-96, 96]
            ("vco_1", "tuning"): tensor([0.0]),  # [-24.0, 24]
            ("vco_2", "mod_depth"): tensor([0.0]),  # [-96, 96]
            ("vco_2", "tuning"): tensor([0.0]),  # [-24.0, 24]
        }

        # 0 for sin saw, 1 for sin square, 2 for saw square,
        # 3 for sin triangle, 4 for triangle saw, 5 for triangle square
        osc_types = params.get("osc_types", 0)
        if osc_types == 0:
            synth = self.SinSawVoice
            syn_parameters[("vco_2", "shape")] = tensor([1])
        elif osc_types == 1:
            synth = self.SinSawVoice
            syn_parameters[("vco_2", "shape")] = tensor([0])
        elif osc_types == 2:
            synth = self.Saw_Square_Voice
            syn_parameters[("vco_1", "shape")] = tensor([1])
            syn_parameters[("vco_2", "shape")] = tensor([0])
        elif osc_types == 3:
            synth = self.SinTriVoice
        elif osc_types == 4:
            synth = self.TriSawVoice
            syn_parameters[("vco_2", "shape")] = tensor([1])
        else:
            synth = self.TriSawVoice
            syn_parameters[("vco_2", "shape")] = tensor([0])

        track = np.zeros(self.n_melody_samples)
        for i in range(len(midi)):
            (location, pitch, duration) = midi[i]
            syn_parameters[("keyboard", "midi_f0")] = tensor([pitch])
            syn_parameters[("keyboard", "duration")] = tensor([duration])
            synth.set_parameters(syn_parameters)

            audio_out, parameters, is_train = synth()
            single_note = audio_out[0]

            single_note = np.hstack(
                [np.zeros(int(location * self.sample_rate)), single_note, np.zeros(self.n_melody_samples)])[
                          :self.n_melody_samples]
            track = track + single_note

        no_cutoff = False
        if no_cutoff:
            return track
        # Apply a low-pass filter by convolving with a sinc impulse response (DDSP).
        cutoff_freq = tf_float32(cutoff_freq)
        impulse_response = ddsp.core.sinc_impulse_response(cutoff_freq,
                                                           2048,
                                                           self.sample_rate)
        track = tf_float32(track)
        return ddsp.core.fft_convolve(track[tf.newaxis, :], impulse_response)[0, :]
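A minimal usage sketch (the sample rate, parameter values, and the two-note midi are illustrative assumptions):

gen = MelodyGenerator(sample_rate=16000, n_note_samples=4 * 16000, n_melody_samples=4 * 16000)
params = {"osc1_amp": 0.8, "osc2_amp": 0.4, "attack": 0.05, "decay": 0.3,
          "sustain": 0.6, "release": 0.4, "cutoff_freq": 4000, "osc_types": 0}
audio = gen.get_melody(params, [(0.0, 60, 1.0), (1.0, 64, 1.0)])  # 4 s tf.Tensor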
melody_synth/non_random_LFOs.py
ADDED
@@ -0,0 +1,121 @@
from typing import Optional

import torch
from torch import tensor

from torchsynth.module import LFO
from torchsynth.signal import Signal


class SinLFO(LFO):
    """An LFO that generates the sine waveform.
    (The implementation of this class is a modification of the code in TorchSynth.)"""

    def output(self, mod_signal: Optional[Signal] = None) -> Signal:
        # This module accepts signals at control rate!
        if mod_signal is not None:
            assert mod_signal.shape == (self.batch_size, self.control_buffer_size)

        frequency = self.make_control(mod_signal)
        argument = torch.cumsum(2 * torch.pi * frequency / self.control_rate, dim=1)
        argument = argument + self.p("initial_phase").unsqueeze(1)

        shapes = torch.stack(self.make_lfo_shapes(argument), dim=1).as_subclass(Signal)

        # Fixed mixing weights over the shapes (sin, tri, saw, rsaw, sqr)
        # replace the random waveform mode of the stock LFO.
        mode = torch.stack([self.p(lfo) for lfo in self.lfo_types], dim=1)
        mode[0] = tensor([1.0, 0., 0., 0., 0.])
        mode = torch.pow(mode, self.exponent)
        mode = mode / torch.sum(mode, dim=1, keepdim=True)
        return torch.matmul(mode.unsqueeze(1), shapes).squeeze(1).as_subclass(Signal)


class TriLFO(LFO):
    """An LFO that generates the triangle waveform.
    (The implementation of this class is a modification of the code in TorchSynth.)"""

    def output(self, mod_signal: Optional[Signal] = None) -> Signal:
        # This module accepts signals at control rate!
        if mod_signal is not None:
            assert mod_signal.shape == (self.batch_size, self.control_buffer_size)

        frequency = self.make_control(mod_signal)
        argument = torch.cumsum(2 * torch.pi * frequency / self.control_rate, dim=1)
        argument = argument + self.p("initial_phase").unsqueeze(1)

        shapes = torch.stack(self.make_lfo_shapes(argument), dim=1).as_subclass(Signal)

        mode = torch.stack([self.p(lfo) for lfo in self.lfo_types], dim=1)
        mode[0] = tensor([0.5, 0.5, 0., 0., 0.])
        mode = torch.pow(mode, self.exponent)
        mode = mode / torch.sum(mode, dim=1, keepdim=True)
        return torch.matmul(mode.unsqueeze(1), shapes).squeeze(1).as_subclass(Signal)


class SawLFO(LFO):
    """An LFO that generates the sawtooth waveform.
    (The implementation of this class is a modification of the code in TorchSynth.)"""

    def output(self, mod_signal: Optional[Signal] = None) -> Signal:
        # This module accepts signals at control rate!
        if mod_signal is not None:
            assert mod_signal.shape == (self.batch_size, self.control_buffer_size)

        frequency = self.make_control(mod_signal)
        argument = torch.cumsum(2 * torch.pi * frequency / self.control_rate, dim=1)
        argument = argument + self.p("initial_phase").unsqueeze(1)

        shapes = torch.stack(self.make_lfo_shapes(argument), dim=1).as_subclass(Signal)

        mode = torch.stack([self.p(lfo) for lfo in self.lfo_types], dim=1)
        mode[0] = tensor([0.5, 0., 0.5, 0., 0.])
        mode = torch.pow(mode, self.exponent)
        mode = mode / torch.sum(mode, dim=1, keepdim=True)
        return torch.matmul(mode.unsqueeze(1), shapes).squeeze(1).as_subclass(Signal)


class RSawLFO(LFO):
    """An LFO that generates the reverse sawtooth (ramp-down) waveform.
    (The implementation of this class is a modification of the code in TorchSynth.)"""

    def output(self, mod_signal: Optional[Signal] = None) -> Signal:
        # This module accepts signals at control rate!
        if mod_signal is not None:
            assert mod_signal.shape == (self.batch_size, self.control_buffer_size)

        frequency = self.make_control(mod_signal)
        argument = torch.cumsum(2 * torch.pi * frequency / self.control_rate, dim=1)
        argument = argument + self.p("initial_phase").unsqueeze(1)

        shapes = torch.stack(self.make_lfo_shapes(argument), dim=1).as_subclass(Signal)

        mode = torch.stack([self.p(lfo) for lfo in self.lfo_types], dim=1)
        mode[0] = tensor([0.5, 0., 0.0, 0.5, 0.])
        mode = torch.pow(mode, self.exponent)
        mode = mode / torch.sum(mode, dim=1, keepdim=True)
        return torch.matmul(mode.unsqueeze(1), shapes).squeeze(1).as_subclass(Signal)


class SquareLFO(LFO):
    """An LFO that generates the square waveform.
    (The implementation of this class is a modification of the code in TorchSynth.)"""

    def output(self, mod_signal: Optional[Signal] = None) -> Signal:
        # This module accepts signals at control rate!
        if mod_signal is not None:
            assert mod_signal.shape == (self.batch_size, self.control_buffer_size)

        frequency = self.make_control(mod_signal)
        argument = torch.cumsum(2 * torch.pi * frequency / self.control_rate, dim=1)
        argument = argument + self.p("initial_phase").unsqueeze(1)

        shapes = torch.stack(self.make_lfo_shapes(argument), dim=1).as_subclass(Signal)

        mode = torch.stack([self.p(lfo) for lfo in self.lfo_types], dim=1)
        mode[0] = tensor([0.5, 0., 0., 0., 0.5])
        mode = torch.pow(mode, self.exponent)
        mode = mode / torch.sum(mode, dim=1, keepdim=True)
        return torch.matmul(mode.unsqueeze(1), shapes).squeeze(1).as_subclass(Signal)
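A quick standalone sketch of one of these LFOs; the call pattern mirrors how AmpModTorchSynth invokes them, and the exact output shape is an assumption:

from torchsynth.config import SynthConfig
from melody_synth.non_random_LFOs import SinLFO

config = SynthConfig(batch_size=1, reproducible=False)
lfo = SinLFO(config)
control = lfo()  # control-rate sine signal, one row per batch element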
melody_synth/random_duration.py
ADDED
@@ -0,0 +1,86 @@
from configurations.read_configuration import midi_parameter_range, midi_is_discrete
from data_generation.encoding import ParameterDescription


def get_full_duration(midi):
    """Uses the "full_duration" strategy to generate durations (see Readme)."""
    n = len(midi)
    (time_starting_point, pitch) = midi[n - 1]
    new_midi = [(time_starting_point, pitch, 0.5)]
    next_time_starting_point = time_starting_point
    for i in range(n - 1):
        (time_starting_point, pitch) = midi[n - 2 - i]
        duration = (next_time_starting_point - time_starting_point) * 0.9
        new_midi.insert(0, (time_starting_point, pitch, duration))
        next_time_starting_point = time_starting_point

    return new_midi


def get_random_duration(midi):
    """Uses the "random_duration" strategy to generate random durations (see Readme)."""
    parameterDescription = ParameterDescription(name="duration",
                                                values=midi_parameter_range('duration'),
                                                discrete=midi_is_discrete('duration'))
    n = len(midi)
    new_midi = []
    for i in range(n):
        (location, pitch) = midi[i]
        duration = float(parameterDescription.generate().value)
        new_midi.append((location, pitch, duration))
    return new_midi


def get_fixed_duration(midi):
    """Assigns a fixed duration of 2.0 seconds to every note."""
    return [(location, pitch, 2.0) for (location, pitch) in midi]


def get_limited_random_duration(midi):
    """Uses the "limited_random_duration" strategy to generate random durations (see Readme)."""
    parameterDescription = ParameterDescription(name="duration",
                                                values=midi_parameter_range('duration'),
                                                discrete=midi_is_discrete('duration'))
    n = len(midi)
    (time_starting_point, pitch) = midi[n - 1]
    duration = float(parameterDescription.generate().value)
    new_midi = [(time_starting_point, pitch, duration)]
    next_time_starting_point = time_starting_point
    for i in range(n - 1):
        (time_starting_point, pitch) = midi[n - 2 - i]
        max_duration = (next_time_starting_point - time_starting_point) * 0.9
        duration = float(parameterDescription.generate().value)
        duration = min(duration, max_duration)
        new_midi.insert(0, (time_starting_point, pitch, duration))
        next_time_starting_point = time_starting_point

    return new_midi


class RandomDuration:
    """Third component in the random midi pipeline, responsible for generating random durations (keyboard hold times)."""

    def __call__(self, strategy: str, midi, *args, **kwargs):
        """Choose the required strategy to generate a random duration for each note.

        Parameters
        ----------
        strategy: str
            Strategy name for random durations (see Readme).
        midi: List[(float, float)]
            Random rhythm and pitch from the previous pipeline components.

        Returns
        -------
        midi: List[(float, float, float)]
            The input list with a duration assigned to each note onset.
        """

        if strategy == 'random_duration':
            midi = get_random_duration(midi)
        elif strategy == 'limited_random_duration':
            midi = get_limited_random_duration(midi)
        elif strategy == 'fixed_duration':
            midi = get_fixed_duration(midi)
        else:
            midi = get_full_duration(midi)
        return midi
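A small worked example of the duration component; "full_duration" is deterministic, so the output follows directly from the made-up input:

random_duration = RandomDuration()
midi = [(0.0, 60), (0.5, 64), (1.2, 67)]
print(random_duration("full_duration", midi))
# ≈ [(0.0, 60, 0.45), (0.5, 64, 0.63), (1.2, 67, 0.5)] -- 90% of each gap, 0.5 s for the last note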
melody_synth/random_midi.py
ADDED
@@ -0,0 +1,86 @@
import numpy as np

from configurations.read_configuration import get_conf_sample_rate, get_conf_stft_hyperparameter, \
    midi_parameter_range, get_conf_time_resolution, get_conf_max_n_notes
from melody_synth.random_duration import RandomDuration
from melody_synth.random_pitch import RandomPitch
from melody_synth.random_rhythm import RandomRhythm


class RandomMidi:
    """Pipeline generating random midi."""

    def __init__(self):
        self.randomRhythm = RandomRhythm()
        self.randomPitch = RandomPitch()
        self.randomDuration = RandomDuration()
        self.max_n_notes = get_conf_max_n_notes()

    def __call__(self, strategy=None, *args, **kwargs):
        """Assembles the pipeline based on the given strategies and returns a random midi.

        Parameters
        ----------
        strategy: Dict[str, str]
            Strategy names for random rhythm, pitch and duration generation (see Readme).

        Returns
        -------
        encode, midi: List[int], List[(float, float, float)]
            encode -- The midi's label as a list of 0s and 1s.
            midi -- A list of (onset, pitch, duration) tuples, one tuple per note.
        """

        if strategy is None:
            strategy = {"rhythm_strategy": "non-test",
                        "pitch_strategy": "random_major",
                        "duration_strategy": "limited_random",
                        }

        midi = self.randomRhythm(strategy["rhythm_strategy"])
        midi = self.randomPitch(strategy["pitch_strategy"], midi)
        midi = self.randomDuration(strategy["duration_strategy"], midi)

        return self.get_encode(midi), midi

    def get_encode(self, midi):
        """Generate labels for a midi.

        Parameters
        ----------
        midi: List[(onset, pitch, duration)]
            A list of (onset, pitch, duration) tuples, one tuple per note.

        Returns
        -------
        encode: List[int]
            The midi's label as a list of 0s and 1s.

        Encoding method
        ---------------
        One-hot encoding for each note; all note labels are stacked to form the midi label.
        """
        duration_range = midi_parameter_range("duration")
        pitch_range = midi_parameter_range("pitch")
        time_resolution = get_conf_time_resolution()

        # Duration in seconds of one spectrogram time pixel.
        pixel_duration = get_conf_stft_hyperparameter()["frame_step"] / get_conf_sample_rate()
        single_note_encode_length = (time_resolution + len(pitch_range) + len(duration_range))
        encode_length = single_note_encode_length * self.max_n_notes
        encode = []
        for i in range(len(midi)):
            (location, pitch, duration) = midi[i]

            location_index = int(float(location) / pixel_duration)
            if location_index >= time_resolution:
                break
            pitch_index = pitch - pitch_range[0]
            duration_index = np.argmin(np.abs(np.array(duration_range) - duration))

            single_note_encode = np.zeros(single_note_encode_length)
            single_note_encode[location_index] = 1
            single_note_encode[time_resolution + pitch_index] = 1
            single_note_encode[time_resolution + len(pitch_range) + duration_index] = 1
            encode = np.hstack([encode, single_note_encode])

        # Pad (or truncate) to the fixed label length.
        return np.hstack([encode, np.zeros(encode_length)])[:encode_length]
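A usage sketch for the pipeline; the strategy names are taken from the individual components:

random_midi = RandomMidi()
strategy = {"rhythm_strategy": "free_rhythm",
            "pitch_strategy": "random_major",
            "duration_strategy": "limited_random_duration"}
encode, midi = random_midi(strategy)
print(len(encode), midi[0])  # fixed-length 0/1 label, first (onset, pitch, duration) tuple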
melody_synth/random_pitch.py
ADDED
@@ -0,0 +1,87 @@
import numpy as np

from configurations.read_configuration import midi_parameter_range


class RandomPitch:
    """Second component in the random midi pipeline, responsible for generating random pitches."""

    def __init__(self):
        self.pitch_range = midi_parameter_range("pitch")
        self.major = [0, 2, 4, 5, 7, 9, 11]
        self.minor = [0, 2, 3, 5, 7, 8, 10]

    def __call__(self, strategy: str, onsets, *args, **kwargs):
        """Choose the required strategy to generate a random pitch for each note.

        Parameters
        ----------
        strategy: str
            Strategy name for random pitches (see Readme).
        onsets: List[float]
            Random rhythm from the previous pipeline component.

        Returns
        -------
        midi: List[(float, float)]
            The input list with a pitch assigned to each note onset.
        """

        if strategy == 'random_major':
            return self.get_random_major(onsets)
        elif strategy == 'random_minor':
            return self.get_random_minor(onsets)
        elif strategy == 'fixed_pitch':
            return self.get_fixed_pitch(onsets)
        elif strategy == 'fixed_pitch1':
            return self.get_fixed_pitch1(onsets)
        elif strategy == 'fixed_pitch2':
            return self.get_fixed_pitch2(onsets)
        elif strategy == 'fixed_pitch3':
            return self.get_fixed_pitch3(onsets)
        elif strategy == 'fixed_pitch4':
            return self.get_fixed_pitch4(onsets)
        else:
            return self.get_random_pitch(onsets)

    def get_random_major(self, midi):
        """Uses the "random_major" strategy: random pitches from a random major scale (see Readme)."""
        random_scale = np.random.randint(0, 12)
        scale = [one for one in self.pitch_range if (one - random_scale) % 12 in self.major]
        midi = [(onset, scale[np.random.randint(0, len(scale))]) for onset in midi]
        # midi[0] = (midi[0][0], random_scale + self.pitch_range[-1])
        midi[len(midi) - 1] = (midi[len(midi) - 1][0], random_scale + self.pitch_range[0])
        return midi

    def get_random_pitch(self, midi):
        """Uses the "free_pitch" strategy: uniformly random pitches (see Readme)."""
        return [(onset, np.random.randint(self.pitch_range[0], self.pitch_range[-1])) for onset in midi]

    def get_fixed_pitch(self, midi):
        """Assigns the fixed MIDI pitch 48 to every onset."""
        return [(onset, 48) for onset in midi]

    def get_fixed_pitch1(self, midi):
        """Assigns the fixed MIDI pitch 55 to every onset."""
        return [(onset, 55) for onset in midi]

    def get_fixed_pitch2(self, midi):
        """Assigns the fixed MIDI pitch 62 to every onset."""
        return [(onset, 62) for onset in midi]

    def get_fixed_pitch3(self, midi):
        """Assigns the fixed MIDI pitch 69 to every onset."""
        return [(onset, 69) for onset in midi]

    def get_fixed_pitch4(self, midi):
        """Assigns the fixed MIDI pitch 76 to every onset."""
        return [(onset, 76) for onset in midi]

    def get_random_minor(self, midi):
        """Uses the "random_minor" strategy: random pitches from a random minor scale (see Readme)."""
        random_scale = np.random.randint(0, 12)
        scale = [one for one in self.pitch_range if (one - random_scale) % 12 in self.minor]
        midi = [(onset, scale[np.random.randint(0, len(scale))]) for onset in midi]
        # midi[0] = (midi[0][0], random_scale + self.pitch_range[-1])
        midi[len(midi) - 1] = (midi[len(midi) - 1][0], random_scale + self.pitch_range[0])
        return midi
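A quick sketch of the pitch component (pitches are random, so the shown result is just one possible draw):

random_pitch = RandomPitch()
print(random_pitch("random_major", [0.0, 0.5, 1.0]))
# e.g. [(0.0, 64), (0.5, 69), (1.0, 52)] -- the last note is forced to the scale root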
melody_synth/random_rhythm.py
ADDED
@@ -0,0 +1,143 @@
import random

import numpy as np

from configurations.read_configuration import get_conf_n_sample, get_conf_sample_rate, get_conf_max_n_notes


def get_random_note_type_index(distribution):
    """A helper method that randomly chooses the next note type based on a distribution.

    Parameters
    ----------
    distribution: List[float]
        Note type distribution.

    Returns
    -------
    index: int
        Random type index.
    """

    r = np.random.random()
    for i in range(len(distribution)):
        r = r - distribution[i]
        if r < 0:
            return i
    return len(distribution) - 1


# Todo: rewrite this part
def to_onsets_in_seconds(bpm, notes):
    """A helper method that transforms a list of note lengths into a list of note onsets (in seconds).

    Parameters
    ----------
    bpm: float
        Beats per minute.
    notes: List[float]
        Note lengths as fractions of a whole note.

    Returns
    -------
    onsets: List[float]
        Note onsets in seconds.
    """

    full_note_length = 4 * 60 / bpm
    onsets = [0]
    for i in range(len(notes)):
        onsets.append(onsets[i] + full_note_length * notes[i])
    return onsets


class RandomRhythm:
    """First component in the random midi pipeline, responsible for generating random rhythms (note onsets)."""

    def __init__(self):
        self.note_types = [0, 1, 3 / 4, 0.5, 3 / 8, 0.25, 1 / 8]
        self.first_note_type_distribution = np.array([0, 0.2, 0.05, 0.25, 0.05, 0.3, 0.15])
        self.rhythm_generation_matrix = np.array([
            [0.1, 0.1, 0.25, 0.1, 0.25, 0.2],
            [0.05, 0.25, 0.25, 0.05, 0.3, 0.1],
            [0.1, 0.1, 0.3, 0.05, 0.35, 0.1],
            [0.05, 0.05, 0.2, 0.2, 0.25, 0.25],
            [0.1, 0.05, 0.1, 0.05, 0.4, 0.3],
            [0.1, 0.05, 0.1, 0.1, 0.3, 0.35],
        ])
        # self.bpm = bpm
        self.rhythm_duration = np.array([0, 1, 3 / 4, 0.5, 3 / 8, 0.25])
        self.audio_length = get_conf_n_sample() / get_conf_sample_rate()
        self.bpm_range = [90, 100, 110, 120, 130, 140, 150, 160, 170]
        self.max_n_notes = get_conf_max_n_notes()

    def __call__(self, strategy: str, *args, **kwargs):
        """Choose the required strategy to generate a random rhythm (note onsets).

        Parameters
        ----------
        strategy: str
            Strategy name for random rhythm (see Readme).

        Returns
        -------
        onsets: List[float]
            A list of floats referring to note onsets in seconds.
        """
        if strategy == 'bpm_based_rhythm':
            rhythm = self.get_bpm_based_rhythm()
        elif strategy == 'free_rhythm':
            rhythm = self.get_free_rhythm()
        elif strategy == 'single_note_rhythm':
            rhythm = self.get_single_note()
        else:
            rhythm = [0.0, 1, 2, 3, 4]

        return rhythm[:self.max_n_notes]

    def get_bpm_based_rhythm(self):
        """Uses the "bpm_based_rhythm" strategy to generate a random rhythm (see Readme)."""
        # Todo: clean up this part

        bpm = random.choice(self.bpm_range)

        first_note = get_random_note_type_index(self.first_note_type_distribution)
        note_type_indexes = [first_note]
        current_note_type = first_note
        while True:
            current_note_type = get_random_note_type_index(self.rhythm_generation_matrix[current_note_type - 1]) + 1
            note_type_indexes.append(current_note_type)

            # Random early stop
            if np.random.random() < 9 / bpm:
                break

        notes = [self.note_types[note_type_index] for note_type_index in note_type_indexes]

        onsets = to_onsets_in_seconds(bpm, notes)
        return onsets

    def get_free_rhythm(self):
        """Uses the "free_rhythm" strategy to generate a random rhythm (see Readme)."""
        n_notes = np.random.randint(int(self.max_n_notes * 0.6), self.max_n_notes)
        # n_notes = np.random.randint(int(1), self.max_n_notes)

        onsets = np.random.rand(n_notes)
        onsets.sort()

        # Avoid notes that are too close together
        pre = onsets[0]
        n_removed = 0
        for i in range(len(onsets) - 1):
            index = i - n_removed + 1
            if (onsets[index] - pre) < 0.05:
                new_onsets = np.delete(onsets, index)
                onsets = new_onsets
                n_removed = n_removed + 1
            else:
                pre = onsets[index]

        return ((onsets - onsets[0]) * self.audio_length).tolist()

    def get_single_note(self):
        """Returns a single note at onset 0."""
        return [0.0]
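A quick sketch of the rhythm component (the output is random and capped at the configured maximum note count):

random_rhythm = RandomRhythm()
onsets = random_rhythm("bpm_based_rhythm")
print(onsets)  # e.g. [0, 0.6, 1.2, 1.5] -- note onsets in seconds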
model/VAE.py
ADDED
@@ -0,0 +1,230 @@
import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Dense, Conv2D, Conv2DTranspose, Flatten, Reshape, Lambda, BatchNormalization
from keras.models import Model
import numpy as np
import threading

KL = tf.keras.layers


def cbam_layer(inputs_tensor=None, ratio=None):
    """Channel attention layer (CBAM style).
    Source: https://blog.csdn.net/ZXF_1991/article/details/104615942
    """
    channels = K.int_shape(inputs_tensor)[-1]

    def share_layer(inputs=None):
        x_ = KL.Conv2D(channels // ratio, (1, 1), strides=1, padding="valid")(inputs)
        x_ = KL.Activation('relu')(x_)
        output_share = KL.Conv2D(channels, (1, 1), strides=1, padding="valid")(x_)
        return output_share

    x_global_avg_pool = KL.GlobalAveragePooling2D()(inputs_tensor)
    x_global_avg_pool = KL.Reshape((1, 1, channels))(x_global_avg_pool)
    x_global_max_pool = KL.GlobalMaxPool2D()(inputs_tensor)
    x_global_max_pool = KL.Reshape((1, 1, channels))(x_global_max_pool)
    x_global_avg_pool = share_layer(x_global_avg_pool)
    x_global_max_pool = share_layer(x_global_max_pool)
    x = KL.Add()([x_global_avg_pool, x_global_max_pool])
    x = KL.Activation('sigmoid')(x)
    CAM = KL.multiply([inputs_tensor, x])
    output = CAM
    return output


def res_cell(x, n_channel=64, stride=1):
    """The basic unit of the VAE: a residual cell."""
    if stride == -1:
        # upsample cell
        skip = tf.keras.layers.UpSampling2D(size=(2, 2), interpolation='bilinear')(x)
        skip = Conv2D(filters=n_channel, kernel_size=(1, 1), strides=1, padding='same')(skip)
        x = Conv2DTranspose(filters=n_channel, kernel_size=(5, 5), strides=2, padding='same')(x)
        x = BatchNormalization()(x)
        x = tf.keras.activations.elu(x)
        x = Conv2DTranspose(filters=n_channel, kernel_size=(5, 5), padding='same')(x)

    elif stride == 2:
        # downsample cell
        skip = Conv2D(filters=n_channel, kernel_size=(1, 1), strides=2, padding='same')(x)
        x = Conv2D(filters=n_channel, kernel_size=(5, 5), strides=stride, padding='same')(x)
        x = BatchNormalization()(x)
        x = tf.keras.activations.elu(x)
        x = Conv2D(filters=n_channel, kernel_size=(5, 5), padding='same')(x)
    else:
        # shape-preserving cell
        skip = tf.identity(x)
        x = Conv2D(filters=n_channel, kernel_size=(5, 5), strides=stride, padding='same')(x)
        x = BatchNormalization()(x)
        x = tf.keras.activations.elu(x)
        x = Conv2D(filters=n_channel, kernel_size=(5, 5), padding='same')(x)

    x = BatchNormalization()(x)
    x = cbam_layer(inputs_tensor=x, ratio=8)
    x = x + skip
    x = tf.keras.activations.elu(x)
    return x


def res_block(x, n_channel=64, upsample=False, n_cells=2):
    """A block is a stack of cells: one resampling cell followed by shape-preserving cells."""
    if upsample:
        x = res_cell(x, n_channel=n_channel, stride=-1)
    else:
        x = res_cell(x, n_channel=n_channel, stride=2)
    for _ in range(n_cells - 1):
        x = res_cell(x, n_channel=n_channel, stride=1)
    return x


def l1_distance(x1, x2):
    return tf.reduce_mean(tf.math.abs(x1 - x2))


def l1_log_distance(x1, x2):
    return tf.reduce_mean(tf.math.abs(tf.math.log(tf.maximum(1e-6, x1)) - tf.math.log(tf.maximum(1e-6, x2))))


img_height = 512
img_width = 256
num_channels = 1
input_shape = (img_height, img_width, num_channels)
timbre_dim = 20
n_filters = 64
act = 'elu'


def compute_latent(x):
    """Re-parameterization trick: sample z = mu + exp(sigma / 2) * eps."""
    mu, sigma = x
    batch = K.shape(mu)[0]
    dim = K.int_shape(mu)[1]
    eps = K.random_normal(shape=(batch, dim))
    return mu + K.exp(sigma / 2) * eps


def get_encoder(N2=0, channel_sizes=None):
    """Assemble and return the VAE encoder."""
    if channel_sizes is None:
        channel_sizes = [32, 64, 64, 96, 96, 128, 160, 216]
    encoder_input = Input(shape=input_shape)

    encoder_conv = res_block(encoder_input, channel_sizes[0], upsample=False, n_cells=1)

    for c in channel_sizes[1:]:
        encoder_conv = res_block(encoder_conv, c, upsample=False, n_cells=1 + N2)

    encoder = Flatten()(encoder_conv)

    mu_timbre = Dense(timbre_dim)(encoder)
    sigma_timbre = Dense(timbre_dim)(encoder)
    latent_vector = Lambda(compute_latent, output_shape=(timbre_dim,))([mu_timbre, sigma_timbre])

    kl_loss = -0.5 * (1 + sigma_timbre - tf.square(mu_timbre) - tf.exp(sigma_timbre))
    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))

    encoder = Model(encoder_input, [latent_vector, kl_loss])
    return encoder


def get_decoder(N2=0, N3=8, channel_sizes=None):
    """Assemble and return the VAE decoder."""
    if channel_sizes is None:
        channel_sizes = [32, 64, 64, 96, 96, 128, 160, 216]
    conv_shape = [-1, 2 ** (9 - N3), 2 ** (8 - N3), channel_sizes[-1]]
    decoder_input = Input(shape=(timbre_dim,))

    decoder = Dense(conv_shape[1] * conv_shape[2] * conv_shape[3], activation=act)(decoder_input)
    decoder_conv = Reshape((conv_shape[1], conv_shape[2], conv_shape[3]))(decoder)

    for c in list(reversed(channel_sizes))[1:]:
        decoder_conv = res_block(decoder_conv, c, upsample=True, n_cells=1 + N2)

    decoder_conv = Conv2DTranspose(filters=num_channels, kernel_size=5, strides=2,
                                   padding='same', activation='sigmoid')(decoder_conv)
|
145 |
+
|
146 |
+
decoder = Model(decoder_input, decoder_conv)
|
147 |
+
return decoder
|
148 |
+
|
149 |
+
|
150 |
+
def VAE(N2=0, N3=8, channel_sizes=None):
|
151 |
+
"""Assemble and return the VAE."""
|
152 |
+
if channel_sizes is None:
|
153 |
+
channel_sizes = [32, 64, 64, 96, 96, 128, 160, 216]
|
154 |
+
print("Creating model...")
|
155 |
+
assert N2 >= 0, "Please set N2 >= 0"
|
156 |
+
assert N3 >= 1, "Please set 1 <= N3 <= 8"
|
157 |
+
assert N3 <= 8, "Please set 1 <= N3 <= 8"
|
158 |
+
assert N3 == len(channel_sizes), "Please set N3 = len(channel_sizes)"
|
159 |
+
encoder = get_encoder(N2, channel_sizes)
|
160 |
+
decoder = get_decoder(N2, N3, channel_sizes)
|
161 |
+
|
162 |
+
# encoder = tf.keras.models.load_model(f"encoder_thesis_record_1.h5")
|
163 |
+
# decoder = tf.keras.models.load_model(f"decoder_thesis_record_1.h5")
|
164 |
+
|
165 |
+
encoder_input1 = Input(shape=input_shape)
|
166 |
+
scalar_input1 = Input(shape=(1,))
|
167 |
+
|
168 |
+
embedding_1_timbre, kl_loss = encoder(encoder_input1)
|
169 |
+
reconstruction_1 = decoder(embedding_1_timbre)
|
170 |
+
|
171 |
+
VAE = Model([encoder_input1, scalar_input1], [reconstruction_1, kl_loss])
|
172 |
+
# decoder.summary()
|
173 |
+
VAE.summary()
|
174 |
+
return encoder, decoder, VAE
|
175 |
+
|
176 |
+
|
177 |
+
def my_thread(data_cache):
|
178 |
+
data_cache.refresh()
|
179 |
+
|
180 |
+
|
181 |
+
def train_VAE(vae, encoder, decoder, data_cache, stages, batch_size):
|
182 |
+
"""Train the VAE.
|
183 |
+
|
184 |
+
Parameters
|
185 |
+
----------
|
186 |
+
vae: keras.engine.functional.Functional
|
187 |
+
The VAE.
|
188 |
+
encoder: keras.engine.functional.Functional
|
189 |
+
The VAE encoder.
|
190 |
+
decoder: keras.engine.functional.Functional
|
191 |
+
The VAE decoder.
|
192 |
+
data_cache: Data_cache
|
193 |
+
A Data_cache entity that provides training data.
|
194 |
+
stages: Dict
|
195 |
+
Defines the training stages. In each stage, the synthetic data will be refreshed and
|
196 |
+
models will be stored once.
|
197 |
+
|
198 |
+
Returns
|
199 |
+
-------
|
200 |
+
"""
|
201 |
+
threshold = 1e-0
|
202 |
+
kl_weight = 100.0
|
203 |
+
|
204 |
+
def weighted_binary_cross_entropy_loss(true, pred):
|
205 |
+
b_n = true * tf.math.log(tf.maximum(1e-20, pred)) + (1 - true) * tf.math.log(tf.maximum(1e-20, 1 - pred))
|
206 |
+
w = tf.maximum(threshold, true)
|
207 |
+
return -tf.reduce_sum(b_n / w) / batch_size
|
208 |
+
|
209 |
+
def reconstruction_loss(true, pred):
|
210 |
+
reconstruction_loss = weighted_binary_cross_entropy_loss(K.flatten(true), K.flatten(pred))
|
211 |
+
return K.mean(reconstruction_loss)
|
212 |
+
|
213 |
+
def kl_loss(true, pred):
|
214 |
+
return pred * kl_weight
|
215 |
+
|
216 |
+
for stage in stages:
|
217 |
+
threshold = stage["threshold"]
|
218 |
+
kl_weight = stage["kl_weight"]
|
219 |
+
vae.compile(tf.keras.optimizers.Adam(learning_rate=stage["learning_rate"]), loss=[reconstruction_loss, kl_loss])
|
220 |
+
|
221 |
+
Input_all = data_cache.get_all_data()
|
222 |
+
n_total = np.shape(Input_all)[0]
|
223 |
+
|
224 |
+
t = threading.Thread(target=my_thread, args=(data_cache,))
|
225 |
+
t.start()
|
226 |
+
history = vae.fit([Input_all, np.ones(n_total)], [Input_all, np.ones(n_total)], epochs=stage["n_epoch"],
|
227 |
+
batch_size=batch_size)
|
228 |
+
t.join()
|
229 |
+
encoder.save(f"./models/new_trained_models/encoder.h5")
|
230 |
+
decoder.save(f"./models/new_trained_models/decoder.h5")
|
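
A minimal training sketch for this module, assuming `load_data` returns a `Data_cache` with the `get_all_data()`/`refresh()` interface that `train_VAE` expects; the stage values are illustrative placeholders, not the settings used for the released checkpoints:

from model.VAE import VAE, train_VAE
from load_data import load_data

encoder, decoder, vae = VAE(N2=0, N3=8, channel_sizes=[32, 64, 64, 96, 96, 128, 160, 216])
data_cache = load_data(500)  # assumed to return a Data_cache of spectrograms
stages = [
    {"threshold": 1.0, "kl_weight": 100.0, "learning_rate": 1e-4, "n_epoch": 10},  # illustrative
    {"threshold": 0.1, "kl_weight": 10.0, "learning_rate": 1e-5, "n_epoch": 10},   # illustrative
]
train_VAE(vae, encoder, decoder, data_cache, stages, batch_size=16)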
model/VAE_torchV.py
ADDED
@@ -0,0 +1,171 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class ChannelAttention(nn.Module):
    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
        y = avg_out + max_out
        y = self.sigmoid(y)

        return x * y.expand_as(x)


class ResCell(nn.Module):
    def __init__(self, input_channel, output_channel, stride=1):
        super(ResCell, self).__init__()

        self.stride = stride
        self.input_channel = input_channel
        self.output_channel = output_channel

        if self.stride == -1:
            # upsample cell
            self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.skip = nn.Conv2d(self.input_channel, self.output_channel, kernel_size=1, stride=1, padding=0)
            self.conv1 = nn.ConvTranspose2d(self.input_channel, self.output_channel, kernel_size=5, stride=2, padding=2, output_padding=1)
            self.conv2 = nn.ConvTranspose2d(self.output_channel, self.output_channel, kernel_size=5, padding=2)

        elif self.stride == 2:
            # downsample cell
            self.skip = nn.Conv2d(self.input_channel, self.output_channel, kernel_size=1, stride=2, padding=0)
            self.conv1 = nn.Conv2d(self.input_channel, self.output_channel, kernel_size=5, stride=self.stride, padding=2)
            self.conv2 = nn.Conv2d(self.output_channel, self.output_channel, kernel_size=5, padding=2)

        else:
            # shape-preserving cell (input_channel must equal output_channel)
            self.conv1 = nn.Conv2d(self.input_channel, self.output_channel, kernel_size=5, stride=self.stride, padding=2)
            self.conv2 = nn.Conv2d(self.output_channel, self.output_channel, kernel_size=5, padding=2)

        self.bn1 = nn.BatchNorm2d(self.output_channel)
        self.bn2 = nn.BatchNorm2d(self.output_channel)

        # Please replace `CBAM` with the actual module and parameters
        self.cbam = ChannelAttention(self.output_channel)

    def forward(self, x):
        if self.stride == -1:
            upsampled_x = self.upsample(x)
            skip = self.skip(upsampled_x)
            x = F.elu(self.bn1(self.conv1(x)))
            x = self.conv2(x)
        elif self.stride == 2:
            skip = self.skip(x)
            x = F.elu(self.bn1(self.conv1(x)))
            x = self.conv2(x)
        else:
            skip = x
            x = F.elu(self.bn1(self.conv1(x)))
            x = self.conv2(x)

        x = self.bn2(x)
        x = self.cbam(x)
        x = x + skip
        x = F.elu(x)

        return x


class ResBlock(nn.Module):
    def __init__(self, input_channel, output_channel, upsample=False, n_cells=2):
        super(ResBlock, self).__init__()

        stride = -1 if upsample else 2
        self.cells = nn.ModuleList([ResCell(input_channel, output_channel, stride=stride)])

        # Subsequent cells preserve the channel count, so they must take
        # output_channel as their input channel (the original passed
        # input_channel here, which only works when the two are equal).
        for _ in range(n_cells - 1):
            self.cells.append(ResCell(output_channel, output_channel, stride=1))

    def forward(self, x):
        for cell in self.cells:
            x = cell(x)
        return x


class Encoder(nn.Module):
    def __init__(self, input_shape, timbre_dim, N2=0, channel_sizes=None):
        super(Encoder, self).__init__()

        if channel_sizes is None:
            channel_sizes = [32, 64, 64, 96, 96, 128, 160, 216]

        self.input_shape = input_shape
        self.timbre_dim = timbre_dim
        self.blocks = nn.ModuleList()

        self.blocks.append(ResBlock(input_channel=1, output_channel=channel_sizes[0], upsample=False, n_cells=1))
        input_channel = channel_sizes[0]

        for c in channel_sizes[1:]:
            self.blocks.append(ResBlock(input_channel=input_channel, output_channel=c, upsample=False, n_cells=1 + N2))
            input_channel = c

        self.flatten = nn.Flatten()
        self.mu_timbre = nn.Linear(self._get_flattened_dim(), timbre_dim)
        self.sigma_timbre = nn.Linear(self._get_flattened_dim(), timbre_dim)

    def _get_flattened_dim(self):
        # Trace a dummy input through the blocks to infer the flattened size.
        x = torch.zeros((1,) + self.input_shape)
        for block in self.blocks:
            x = block(x)
        x = self.flatten(x)
        return x.shape[1]

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        for block in self.blocks:
            x = block(x)

        x = self.flatten(x)
        mu = self.mu_timbre(x)
        logvar = self.sigma_timbre(x)
        latent_vector = self.reparameterize(mu, logvar)

        # kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1)
        # kl_loss = torch.mean(kl_loss)

        return mu, logvar, latent_vector


class Decoder(nn.Module):
    def __init__(self, timbre_dim, N2=0, N3=8, channel_sizes=None):
        super(Decoder, self).__init__()

        if channel_sizes is None:
            channel_sizes = [32, 64, 64, 96, 96, 128, 160, 216]

        self.conv_shape = [-1, channel_sizes[-1], 2 ** (9 - N3), 2 ** (8 - N3)]

        self.dense = nn.Linear(timbre_dim, self.conv_shape[1] * self.conv_shape[2] * self.conv_shape[3])
        self.blocks = nn.ModuleList()

        input_channel = channel_sizes[-1]
        for c in list(reversed(channel_sizes))[1:]:
            self.blocks.append(ResBlock(input_channel=input_channel, output_channel=c, upsample=True, n_cells=1 + N2))
            input_channel = c

        self.decoder_conv = nn.ConvTranspose2d(channel_sizes[0], 1, kernel_size=5, stride=2, padding=2, output_padding=1)

    def forward(self, x):
        x = F.elu(self.dense(x))
        x = x.view(-1, self.conv_shape[1], self.conv_shape[2], self.conv_shape[3])
        for block in self.blocks:
            x = block(x)

        x = self.decoder_conv(x)
        x = torch.sigmoid(x)
        return x
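
A quick shape-check sketch for this PyTorch port, using the module's default channel sizes, a 20-dimensional latent space, and the (1, 512, 256) spectrogram shape used throughout:

import torch
from model.VAE_torchV import Encoder, Decoder

encoder = Encoder(input_shape=(1, 512, 256), timbre_dim=20, N2=0)
decoder = Decoder(timbre_dim=20, N2=0, N3=8)

x = torch.rand(2, 1, 512, 256)      # a batch of two VAE inputs in [0, 1]
mu, logvar, z = encoder(x)          # each of shape (2, 20)
reconstruction = decoder(z)         # shape (2, 1, 512, 256)
print(z.shape, reconstruction.shape)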
model/perceptual_label_predictor.py
ADDED
@@ -0,0 +1,68 @@
import tensorflow as tf
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.losses import binary_crossentropy
from load_data import read_data
import joblib
import numpy as np

KL = tf.keras.layers


def perceptual_label_predictor():
    """Assemble and return the perceptual_label_predictor."""
    mini_input = Input((20,))
    p = Dense(20, activation='relu')(mini_input)
    p = Dropout(0.2)(p)
    p = Dense(16, activation='relu')(p)
    p = Dropout(0.2)(p)
    p = Dense(5, activation='sigmoid')(p)
    style_predictor = Model(mini_input, p)
    style_predictor.summary()
    return style_predictor


def train_perceptual_label_predictor(perceptual_label_predictor, encoder):
    """Train the perceptual_label_predictor. (Including data loading.)"""

    Input_synthetic = read_data("./data/labeled_dataset/synthetic_data")
    Input_AU = read_data("./data/external_data/ARTURIA_data")[:100]

    AU_labels = joblib.load("./data/labeled_dataset/ARTURIA_labels")
    synth_labels = joblib.load("./data/labeled_dataset/synthetic_data_labels")

    AU_encode = encoder.predict(Input_AU)[0]
    Synth_encode = encoder.predict(Input_synthetic)[0]

    perceptual_label_predictor.compile(optimizer='adam', loss=binary_crossentropy)
    perceptual_label_predictor.fit(np.vstack([AU_encode, Synth_encode]), np.vstack([AU_labels, synth_labels]), epochs=140, validation_split=0.05, batch_size=16)
    perceptual_label_predictor.save(f"./models/new_trained_models/perceptual_label_predictor.h5")
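
An inference sketch for the predictor, loading the checkpoint shipped in this commit with `compile=False` since only `predict` is needed; the random codes are stand-ins for VAE latent vectors, and the column order follows the label list in new_sound_generation.py:

import numpy as np
import tensorflow as tf

predictor = tf.keras.models.load_model("./models/perceptual_label_predictor.h5", compile=False)
codes = np.random.normal(size=(4, 20)).astype(np.float32)  # stand-ins for encoder outputs
scores = predictor.predict(codes)                          # shape (4, 5), sigmoid values in [0, 1]
# columns: metallic, warm, breathy, evolving, aggressiv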
models/decoder_5_13.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7c25c8e241ade613409293d0ba3b37823a129699e802e1a2e9d9bb0074b11d3
size 16884433
models/encoder_5_13.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3da8ae2c98eea048a5cf4247ed637c67edf8b121e97efd3c7564e8ed4ea9fa51
size 21627911
models/new_trained_models/perceptual_label_predictor.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c6d2c3df579658bc54e07dd76d0e9efb02831bebf5e2d8c104becac930f0071
size 46080
models/perceptual_label_predictor.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f59db5a982347ec8c40452b8f77a35432d00253a8ed347726f9b102fc4550d44
size 46176
new_sound_generation.py
ADDED
@@ -0,0 +1,203 @@
import numpy as np
import matplotlib
from pathlib import Path
import shutil
from tqdm import tqdm
from tools import save_results, VAE_out_put_to_spc, show_spc


def test_reconstruction(encoder, decoder, data, n_sample=5, f=0, path_name="./data/test_reconstruction", save_data=False):
    """Generate and show reconstruction results. Randomly reconstruct 'n_sample' samples in 'data'.
    The index of the first reconstructed sample can be set manually with 'f'.

    Parameters
    ----------
    encoder: keras.engine.functional.Functional
        The VAE encoder.
    decoder: keras.engine.functional.Functional
        The VAE decoder.
    data: numpy array
        The dataset to draw samples from.
    n_sample: int
        Number of samples to reconstruct.
    f: int
        Index of the first reconstructed sample.
    path_name: String
        Path to save the results.
    save_data: bool
        Whether to save the results.

    Returns
    -------
    """
    if save_data:
        if Path(path_name).exists():
            shutil.rmtree(path_name)
        Path(path_name).mkdir(parents=True, exist_ok=True)

    for i in range(n_sample):
        index = np.random.randint(np.shape(data)[0])
        if i == 0:
            index = f
        print("######################################################")
        print(f"index: {index}")

        input = data[index]
        print(f"Original:")
        show_spc(VAE_out_put_to_spc(input))
        if save_data:
            save_results(VAE_out_put_to_spc(input), f"{path_name}/origin_{index}.png", f"{path_name}/origin_{index}.wav")

        input = data[index:index + 1]
        timbre_encode = encoder.predict(input)[0]

        encode = timbre_encode

        reconstruction = decoder.predict(encode)[0]
        reconstruction = VAE_out_put_to_spc(reconstruction)
        reconstruction = np.minimum(5000, reconstruction)
        print(f"Reconstruction:")
        show_spc(reconstruction)
        if save_data:
            save_results(reconstruction, f"{path_name}/reconstruction_{index}.png", f"{path_name}/reconstruction_{index}.wav")


def test_interpulation(data0, data1, encoder, decoder, path_name="./data/test_interpolation", save_data=False):
    """Generate new sounds by latent space interpolation.

    Parameters
    ----------
    data0: numpy array
        First input for interpolation.
    data1: numpy array
        Second input for interpolation.
    encoder: keras.engine.functional.Functional
        The VAE encoder.
    decoder: keras.engine.functional.Functional
        The VAE decoder.
    path_name: String
        Path to save the results.
    save_data: bool
        Whether to save the results.

    Returns
    -------
    """
    if save_data:
        if Path(path_name).exists():
            shutil.rmtree(path_name)
        Path(path_name).mkdir(parents=True, exist_ok=True)

    if save_data:
        save_results(VAE_out_put_to_spc(data0), f"{path_name}/origin_0.png", f"{path_name}/origin_0.wav")
        save_results(VAE_out_put_to_spc(data1), f"{path_name}/origin_1.png", f"{path_name}/origin_1.wav")

    print("First Original:")
    show_spc(VAE_out_put_to_spc(data0))
    print("Second Original:")
    show_spc(VAE_out_put_to_spc(data1))
    print("######################################################")
    print("Interpolations:")
    data0 = np.reshape(data0, (1, 512, 256, 1))
    data1 = np.reshape(data1, (1, 512, 256, 1))
    timbre_encode0 = encoder.predict(data0)[0]
    timbre_encode1 = encoder.predict(data1)[0]

    n_f = 8
    for i in tqdm(range(n_f + 1)):
        rate = 1 - i / n_f
        new_timbre = rate * timbre_encode0 + (1 - rate) * timbre_encode1
        output = decoder.predict(new_timbre)

        spc = np.reshape(VAE_out_put_to_spc(output), (512, 256))
        if save_data:
            save_results(spc, f"{path_name}/test_interpolation_{i}.png", f"{path_name}/test_interpolation_{i}.wav")
        show_spc(spc)


def test_random_sampling(decoder, n_sample=20, mu=np.zeros(20), sigma=np.ones(20), save_data=False, path_name="./data/test_random_sampling"):
    """Generate new sounds by random sampling in the latent space.

    Parameters
    ----------
    decoder: keras.engine.functional.Functional
        The VAE decoder.
    n_sample: int
        Number of samples to generate.
    mu: numpy array
        Mean of the sampling distribution.
    sigma: numpy array
        Spread of the sampling distribution.
    path_name: String
        Path to save the results.
    save_data: bool
        Whether to save the results.

    Returns
    -------
    """
    if save_data:
        if Path(path_name).exists():
            shutil.rmtree(path_name)
        Path(path_name).mkdir(parents=True, exist_ok=True)

    for i in tqdm(range(n_sample)):
        # Note: np.random.normal takes a standard deviation as scale, so sigma is squared here.
        off_set = np.random.normal(mu, np.square(sigma))
        new_timbre = np.reshape(off_set, (1, 20))

        output = decoder.predict(new_timbre)

        spc = np.reshape(VAE_out_put_to_spc(output), (512, 256))
        if save_data:
            save_results(spc, f"{path_name}/random_sampling_{i}.png", f"{path_name}/random_sampling_{i}.wav")
        show_spc(spc)


def test_style_transform(original, encoder, decoder, perceptual_label_predictor, n_samples=10, save_data=False, goal=0, direction=0, path_name="./data/test_style_transform"):
    """Transform the style of a sound by perturbing its latent code and keeping the
    candidate that the perceptual label predictor rates best for the chosen goal.

    Parameters
    ----------
    original: numpy array
        Original for the style transform.
    encoder: keras.engine.functional.Functional
        The VAE encoder.
    decoder: keras.engine.functional.Functional
        The VAE decoder.
    perceptual_label_predictor: keras.engine.functional.Functional
        Model that selects the output.
    path_name: String
        Path to save the results.
    save_data: bool
        Whether to save the results.

    Returns
    -------
    """
    if save_data:
        if Path(path_name).exists():
            shutil.rmtree(path_name)
        Path(path_name).mkdir(parents=True, exist_ok=True)
        save_results(VAE_out_put_to_spc(original), f"{path_name}/origin.png", f"{path_name}/origin.wav")
    labels_names = ["metallic", "warm", "breathy", "evolving", "aggressiv"]
    timbre_dim = 20

    print("Original:")
    show_spc(VAE_out_put_to_spc(original))
    print("######################################################")
    original_code = encoder.predict(np.reshape(original, (1, 512, 256, 1)))[0]
    new_encodes = np.zeros((n_samples, timbre_dim)) + original_code

    new_encodes = [new_encode + np.random.normal(np.zeros(timbre_dim) * 0.2, np.ones(timbre_dim)) for new_encode in new_encodes]
    new_encodes = np.array(new_encodes, dtype=np.float32)
    perceptual_labels = perceptual_label_predictor.predict(new_encodes)[:, goal]

    if direction == 0:
        best_index = np.argmin(perceptual_labels)
        suffix = f"less_{labels_names[goal]}"
    else:
        best_index = np.argmax(perceptual_labels)
        suffix = f"more_{labels_names[goal]}"

    output = decoder.predict(new_encodes[best_index:best_index + 1])

    spc = np.reshape(VAE_out_put_to_spc(output), (512, 256))
    if save_data:
        save_results(spc, f"{path_name}/{suffix}.png", f"{path_name}/{suffix}.wav")
    print(f"Manipulated ({suffix}):")
    show_spc(spc)
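
A usage sketch for these helpers, assuming a Keras decoder checkpoint such as the one written by `train_VAE` in model/VAE.py (path taken from that file; adjust to your checkout):

import numpy as np
import tensorflow as tf
from new_sound_generation import test_random_sampling

decoder = tf.keras.models.load_model("./models/new_trained_models/decoder.h5", compile=False)
# Draw five new sounds from the latent prior and display their spectrograms.
test_random_sampling(decoder, n_sample=5, mu=np.zeros(20), sigma=np.ones(20))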
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio
ddsp
torchsynth
pytorch_lightning==1.7.0
test_audio.wav
ADDED
Binary file (522 kB)
tools.py
ADDED
@@ -0,0 +1,84 @@
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import librosa
from scipy.io.wavfile import write

k = 1e-16


def np_log10(x):
    numerator = np.log(x + 1e-16)
    denominator = np.log(10)
    return numerator / denominator


def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s


def inv_sigmoid(s):
    x = np.log((s / (1 - s)) + 1e-16)
    return x


def spc_to_VAE_input(spc):
    """Restrict the value range to 0 to 1."""
    return spc / (1 + spc)


def VAE_out_put_to_spc(o):
    """Inverse transform of function 'spc_to_VAE_input'."""
    return o / (1 - o + k)


def denoise(spc):
    """Filter background noise. (Not used.)"""
    return np.maximum(0, spc - (2e-5))


hop_length = 256
win_length = 1024


def np_power_to_db(S, amin=1e-16, top_db=80.0):
    """Helper method for scaling."""
    ref = np.max(S)

    # set fixed value for ref

    # take the element-wise maximum
    log_spec = 10.0 * np_log10(np.maximum(amin, S))
    log_spec -= 10.0 * np_log10(np.maximum(amin, ref))

    log_spec = np.maximum(log_spec, np.max(log_spec) - top_db)

    return log_spec


def show_spc(spc, resolution=(512, 256)):
    """Show a spectrogram."""
    spc = np.reshape(spc, resolution)
    magnitude_spectrum = np.abs(spc)
    log_spectrum = np_power_to_db(magnitude_spectrum)
    plt.imshow(np.flipud(log_spectrum))
    plt.show()


def save_results(spectrogram, spectrogram_image_path, waveform_path):
    """Save the input 'spectrogram' and its waveform (reconstructed by Griffin-Lim)
    to the paths given by 'spectrogram_image_path' and 'waveform_path'."""
    # save image
    magnitude_spectrum = np.abs(spectrogram)
    log_spc = np_power_to_db(magnitude_spectrum)
    log_spc = np.reshape(log_spc, (512, 256))
    matplotlib.pyplot.imsave(spectrogram_image_path, log_spc, vmin=-100, vmax=0,
                             origin='lower')

    # save waveform
    abs_spec = np.zeros((513, 256))
    abs_spec[:512, :] = abs_spec[:512, :] + np.sqrt(np.reshape(spectrogram, (512, 256)))
    rec_signal = librosa.griffinlim(abs_spec, n_iter=32, hop_length=256, win_length=1024)
    write(waveform_path, 16000, rec_signal)
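
A small check that `VAE_out_put_to_spc` inverts `spc_to_VAE_input` (up to the constant k = 1e-16), using a stand-in power spectrogram:

import numpy as np
from tools import spc_to_VAE_input, VAE_out_put_to_spc

spc = np.random.random((512, 256)) * 100.0           # stand-in power spectrogram
restored = VAE_out_put_to_spc(spc_to_VAE_input(spc))
print(np.max(np.abs(restored - spc)))                # close to 0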
webUI/initial_example_encodes.json
ADDED
@@ -0,0 +1,210 @@
{
    "random": [
        0.398047679983798,
        0.3181480556003946,
        0.4481247707840732,
        0.17181013678356805,
        0.38504974079327525,
        0.1630011861056878,
        0.2718486665735521,
        0.4338781507304229,
        0.6538380475574059,
        0.4158802639661583,
        0.23043953925285032,
        0.10156416680503988,
        0.30416174813259533,
        0.5135342367189637,
        0.5187104878467569,
        0.3526902627535946,
        0.7747258429690094,
        0.2627179357923156,
        0.9086876170530048,
        0.9271088722414674,
        0.7019110161290005,
        0.7718117435584002,
        0.36794622268993094,
        0.201057894940179
    ],
    "few_overtone_fading_out": [
        -1.0641387701034546,
        -0.22593635320663452,
        -3.0761210918426514,
        0.3395313322544098,
        3.4432809352874756,
        1.2180149555206299,
        -3.312405824661255,
        -0.9400798082351685,
        1.4439473152160645,
        -2.7892191410064697,
        -4.55153751373291,
        2.2633938789367676,
        -1.9029977321624756,
        0.37142419815063477,
        1.7903343439102173,
        3.5063657760620117,
        -1.5748300552368164,
        -2.555540084838867,
        0.07989349216222763,
        0.23952914774417877,
        0.5571582317352295,
        -1.200455904006958,
        -1.2390071153640747,
        -0.5626499652862549
    ],
    "much_overtone": [
        -1.649719476699829,
        -2.1237597465515137,
        -3.3006417751312256,
        -1.4381871223449707,
        2.0898361206054688,
        0.34304752945899963,
        -1.6530671119689941,
        0.6339190006256104,
        -0.700051486492157,
        -0.6604726910591125,
        -2.3133397102355957,
        3.0709166526794434,
        -1.2595521211624146,
        0.6515411138534546,
        1.8037245273590088,
        0.17395955324172974,
        -1.1443099975585938,
        -2.599336624145508,
        -1.909640908241272,
        -1.422598123550415,
        0.5974328517913818,
        -2.559039354324341,
        -2.4977917671203613,
        -0.9264755249023438
    ],
    "much_overtone_high_register": [
        -1.5539246797561646,
        -1.4718247652053833,
        -0.2083689421415329,
        0.47305312752723694,
        -0.3550421893596649,
        1.6288657188415527,
        -2.5005292892456055,
        -0.5079396367073059,
        1.9173517227172852,
        -2.8692283630371094,
        -1.3840413093566895,
        1.8955140113830566,
        -0.6880097389221191,
        1.2770217657089233,
        0.2371762990951538,
        2.726161241531372,
        -1.7957547903060913,
        -0.28189635276794434,
        -0.6052505373954773,
        -0.557244598865509,
        -0.7524706721305847,
        -0.6265298128128052,
        -1.4746038913726807,
        -2.393972396850586
    ],
    "blurry": [
        1.3638803958892822,
        -1.7874348163604736,
        0.4853333532810211,
        -0.9626323580741882,
        -0.2609613537788391,
        -1.5900236368179321,
        -2.30368709564209,
        0.4847792387008667,
        1.5201102495193481,
        -0.5147597789764404,
        -2.452840805053711,
        1.5057097673416138,
        -0.16519485414028168,
        2.126760721206665,
        0.7019514441490173,
        1.612999439239502,
        -0.3407663106918335,
        -2.8276443481445312,
        1.311924934387207,
        -1.5173300504684448,
        -0.015635617077350616,
        -2.7689361572265625,
        1.1804192066192627,
        -3.077000141143799
    ],
    "interesting_release": [
        1.7602112293243408,
        -1.9114476442337036,
        -2.730947256088257,
        -0.6088603138923645,
        3.317946195602417,
        -0.2716876268386841,
        -2.3106558322906494,
        -2.114469289779663,
        -4.443742752075195,
        -1.0665826797485352,
        -3.0929622650146484,
        1.1979585886001587,
        -1.6287152767181396,
        -1.537142276763916,
        2.4184482097625732,
        0.22694607079029083,
        0.10934393107891083,
        -0.18058283627033234,
        -2.489964723587036,
        -4.448374271392822,
        1.2452409267425537,
        0.05835026502609253,
        0.8547804355621338,
        0.8163737654685974
    ],
    "global_trend": [
        -1.0987968444824219,
        -1.1155377626419067,
        0.14996160566806793,
        -3.165109157562256,
        -2.5396244525909424,
        1.8292016983032227,
        -3.5159406661987305,
        3.4396510124206543,
        -2.3765876293182373,
        0.5692415833473206,
        -1.7827686071395874,
        -0.4062053859233856,
        -1.6925498247146606,
        0.7511563897132874,
        0.12510846555233002,
        -0.14617247879505157,
        0.5096412897109985,
        2.2399022579193115,
        0.5798826217651367,
        -1.5942487716674805,
        -0.36588573455810547,
        -0.9877008199691772,
        4.732168674468994,
        -5.468194007873535
    ],
    "crescendo": [
        1.3167105913162231,
        -1.1503334045410156,
        -3.488548517227173,
        -1.146520972251892,
        2.478545665740967,
        -0.5853592753410339,
        -2.1441550254821777,
        -2.1898915767669678,
        -7.137173175811768,
        -0.34099096059799194,
        -3.832253932952881,
        2.0366034507751465,
        -1.3639447689056396,
        -2.3450658321380615,
        1.1388293504714966,
        1.1278795003890991,
        0.6025446653366089,
        -0.2925209701061249,
        -0.07147964835166931,
        -3.1970367431640625,
        2.4061689376831055,
        0.477089524269104,
        -0.8897881507873535,
        2.827509880065918
    ]
}