Interrupt Button Update
Stereo wav file
Improved Melody guided, partial
- app.py +32 -11
- audiocraft/data/audio.py +4 -2
- audiocraft/utils/extend.py +9 -1
app.py
CHANGED
@@ -15,17 +15,20 @@ import time
 import warnings
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-from audiocraft.utils.extend import generate_music_segments, add_settings_to_image
+from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
 import numpy as np
 import random
 
 MODEL = None
 MODELS = None
-IS_SHARED_SPACE = "
+IS_SHARED_SPACE = "Surn/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
 INTERRUPTED = False
 UNLOAD_MODEL = False
 MOVE_TO_CPU = False
 
+def interrupt_callback():
+    return INTERRUPTED
+
 def interrupt():
     global INTERRUPTING
     INTERRUPTING = True
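The interrupt mechanism added here is a plain cooperative flag: the button handler sets a module-level boolean, and the long-running generation loop polls it between segments. Below is a minimal, self-contained sketch of that pattern; the names and timings are illustrative, not the app's exact wiring.

```python
import threading
import time

INTERRUPTING = False  # module-level flag shared between UI and worker

def interrupt():
    # Button handler: raise the flag; the worker notices at its next poll.
    global INTERRUPTING
    INTERRUPTING = True

def long_running_job(total_steps=50):
    # Worker loop that polls the flag between expensive steps.
    global INTERRUPTING
    for step in range(total_steps):
        if INTERRUPTING:
            INTERRUPTING = False  # reset so the next run starts clean
            print(f"Interrupted at step {step}")
            return
        time.sleep(0.1)  # stand-in for generating one audio segment
    print("Finished all steps")

worker = threading.Thread(target=long_running_job)
worker.start()
time.sleep(0.35)  # let a few steps run, then "press the button"
interrupt()
worker.join()
```

Because generation runs on a worker thread while the button handler runs on the server's request thread, polling between segments is what makes a long job cancellable without killing the process.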
@@ -63,9 +66,18 @@ def load_model(version):
 
 
 def predict(model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap=1):
-    global MODEL, INTERRUPTED
+    global MODEL, INTERRUPTED, INTERRUPTING
     output_segments = None
-
+
+    INTERRUPTED = False
+    INTERRUPTING = False
+    if temperature < 0:
+        raise gr.Error("Temperature must be >= 0.")
+    if topk < 0:
+        raise gr.Error("Topk must be non-negative.")
+    if topp < 0:
+        raise gr.Error("Topp must be non-negative.")
+
    if MODEL is None or MODEL.name != model:
        MODEL = load_model(model)
    else:
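The new guards raise gr.Error, which Gradio surfaces to the user as an error message in the UI rather than a raw server traceback. A minimal sketch of that behavior, with a hypothetical handler standing in for predict:

```python
import gradio as gr

def generate(temperature):
    # gr.Error is shown to the user as a UI error message; any other
    # exception would surface as a generic server error instead.
    if temperature < 0:
        raise gr.Error("Temperature must be >= 0.")
    return f"ok, temperature={temperature}"

with gr.Blocks() as demo:
    t = gr.Number(label="Temperature", value=0.75)
    out = gr.Textbox(label="Result")
    gr.Button("Generate").click(generate, inputs=t, outputs=out)

if __name__ == "__main__":
    demo.launch()
```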
@@ -92,6 +104,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
     seed = random.randint(0, 0xffff_ffff_ffff)
     torch.manual_seed(seed)
 
+
     print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
     MODEL.set_generation_params(
         use_sampling=True,
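This hunk only adds whitespace, but its context shows the seeding scheme the rest of the commit builds on: a 48-bit seed is drawn once, fed to torch.manual_seed, and (below) printed into the settings overlay so a run can be reproduced. The effect in isolation, as a quick sketch:

```python
import torch

seed = 0x1234_5678_9ABC   # any value in [0, 0xffff_ffff_ffff]
torch.manual_seed(seed)
a = torch.rand(3)         # the sampling path is now deterministic
torch.manual_seed(seed)
b = torch.rand(3)
print(torch.equal(a, b))  # True: same seed, same samples
```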
@@ -134,6 +147,12 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
         duration -= segment_duration - overlap
         output_segments.append(next_segment)
 
+        if INTERRUPTING:
+            INTERRUPTED = True
+            INTERRUPTING = False
+            print("Function execution interrupted!")
+            raise gr.Error("Interrupted.")
+
     if output_segments:
         try:
             # Combine the output segments into one long audio file or stack tracks
@@ -143,21 +162,22 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
             output = output_segments[0]
             for i in range(1, len(output_segments)):
                 overlap_samples = overlap * MODEL.sample_rate
-                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i]
+                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i]], dim=dimension)
             output = output.detach().cpu().float()[0]
         except Exception as e:
             print(f"Error combining segments: {e}. Using the first segment only.")
             output = output_segments[0].detach().cpu().float()[0]
     else:
         output = output.detach().cpu().float()[0]
+
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         if include_settings:
-            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Melody File:#todo"
+            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody File:#todo"
             background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
         audio_write(
             file.name, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=
-            waveform_video = make_waveform(file.name,bg_image=background, bar_count=
+            loudness_headroom_db=19, loudness_compressor=True, add_suffix=False, channels=2)
+        waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
     if MOVE_TO_CPU:
         MODEL.to('cpu')
     if UNLOAD_MODEL:
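The fixed torch.cat line is the heart of the continuation scheme: consecutive segments share `overlap` seconds of audio, so before appending the next segment the running output is trimmed by `overlap * sample_rate` samples from its tail. `dim=dimension` maps the UI slider onto a tensor axis, matching the slider's info text (2 lengthens along time, 1 stacks tracks as channels). A small self-contained sketch of the dim=2 case; the shapes and 32 kHz rate are assumptions matching MusicGen's usual [batch, channels, time] output:

```python
import torch

sample_rate = 32000   # MusicGen's typical output rate
overlap = 5           # seconds shared between consecutive segments
overlap_samples = overlap * sample_rate

# Two fake 30-second segments shaped [batch, channels, time], where the
# last 5 s of the first overlap the first 5 s of the second.
seg_a = torch.randn(1, 1, 30 * sample_rate)
seg_b = torch.randn(1, 1, 30 * sample_rate)

# Drop the overlap from the tail of the running output, then append the
# next segment along the time axis (dim=2); dim=1 would stack tracks.
joined = torch.cat([seg_a[:, :, :-overlap_samples], seg_b], dim=2)
print(joined.shape)  # torch.Size([1, 1, 1760000]) -> 55 s of audio
```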
@@ -177,6 +197,7 @@ def ui(**kwargs):
     # UnlimitedMusicGen
     This is your private demo for [UnlimitedMusicGen](https://github.com/Oncorporation/audiocraft), a simple and controllable model for music generation
     presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
+    Todo: Working on improved Melody Conditioned Music Generation transitions.
     
     Disclaimer: This won't run on CPU only. Clone this App and run on GPU instance!
     """
@@ -208,12 +229,12 @@ def ui(**kwargs):
         with gr.Row():
             model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
         with gr.Row():
-            duration = gr.Slider(minimum=1, maximum=
+            duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
             overlap = gr.Slider(minimum=1, maximum=29, value=5, step=1, label="Overlap", interactive=True)
             dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
         with gr.Row():
-            topk = gr.Number(label="Top-k", value=250, interactive=True)
-            topp = gr.Number(label="Top-p", value=0, interactive=True)
+            topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
+            topp = gr.Number(label="Top-p", value=0, precision=0, interactive=True)
             temperature = gr.Number(label="Randomness Temperature", value=0.75, precision=None, interactive=True)
             cfg_coef = gr.Number(label="Classifier Free Guidance", value=5.5, precision=None, interactive=True)
         with gr.Row():
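Besides raising the duration cap to 720 seconds, the notable change here is `precision=0` on the two sampling controls: gr.Number rounds its value to that many decimal places, so top-k and top-p reach the handler as integers instead of floats like `250.0`. A minimal illustration:

```python
import gradio as gr

# precision=0 rounds to the nearest integer and delivers an int to the
# event handler, so top-k can be passed straight to the sampler.
topk = gr.Number(label="Top-k", value=250, precision=0)
```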
audiocraft/data/audio.py
CHANGED
@@ -22,7 +22,7 @@ import torchaudio as ta
 
 import av
 
-from .audio_utils import f32_pcm, i16_pcm, normalize_audio
+from .audio_utils import f32_pcm, i16_pcm, normalize_audio, convert_audio
 
 
 _av_initialized = False
@@ -157,7 +157,7 @@ def audio_write(stem_name: tp.Union[str, Path],
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                 loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
-                add_suffix: bool = True) -> Path:
+                add_suffix: bool = True, channels:int = 1) -> Path:
     """Convenience function for saving audio to disk. Returns the filename the audio was written to.
 
     Args:
@@ -190,6 +190,8 @@ def audio_write(stem_name: tp.Union[str, Path],
     wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
                           rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
                           sample_rate=sample_rate, stem_name=str(stem_name))
+    if channels > 1:
+        wav = convert_audio(wav,sample_rate, sample_rate, channels)
     kwargs: dict = {}
     if format == 'mp3':
         suffix = '.mp3'
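This is what produces the "Stereo wav file" in the commit title: `convert_audio` from audiocraft.data.audio_utils handles both resampling and channel remapping, and when called with equal input and output rates, as here, it only changes the channel count, duplicating a mono track into both stereo channels. A quick sketch under that assumption:

```python
import torch
from audiocraft.data.audio_utils import convert_audio

sample_rate = 32000
mono = torch.randn(1, sample_rate * 2)  # [channels=1, time], 2 s of audio

# Same from/to rate, so only the channel count changes: the single source
# channel is duplicated into left and right.
stereo = convert_audio(mono, sample_rate, sample_rate, 2)
print(stereo.shape)  # torch.Size([2, 64000])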
audiocraft/utils/extend.py
CHANGED
@@ -11,6 +11,9 @@ import requests
 from io import BytesIO
 from huggingface_hub import hf_hub_download
 
+
+INTERRUPTING = False
+
 def separate_audio_segments(audio, segment_duration=30, overlap=1):
     sr, audio_data = audio[0], audio[1]
 
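One subtlety worth noting about this flag: app.py imports it with `from audiocraft.utils.extend import INTERRUPTING`, and in Python that binds the current value at import time, so rebinding the name in the importing module does not change the attribute that the loops below read. The runnable stand-in here demonstrates the semantics; signalling through the module attribute (e.g. `extend.INTERRUPTING = True`) is what the polling code would actually observe.

```python
import types

# Build a stand-in "extend" module with a module-level flag, to show why
# `from module import FLAG` cannot be used to signal that module.
extend = types.ModuleType("extend")
extend.INTERRUPTING = False

def poll():
    # stands in for the checks inside generate_music_segments
    return extend.INTERRUPTING

# Importing the name copies the current value; rebinding the copy
# does not touch the module attribute.
INTERRUPTING = extend.INTERRUPTING
INTERRUPTING = True
print(poll())  # False: the module never saw the change

# Rebinding the module attribute is what the polling code observes.
extend.INTERRUPTING = True
print(poll())  # True
```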
@@ -65,6 +68,8 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     # Iterate over the segments to create list of Meldoy tensors
     for segment_idx in range(total_segments):
+        if INTERRUPTING:
+            return [], duration
         print(f"segment {segment_idx + 1} of {total_segments} \r")
         sr, verse = melody_segments[segment_idx][0], torch.from_numpy(melody_segments[segment_idx][1]).to(MODEL.device).float().t().unsqueeze(0)
 
@@ -77,6 +82,9 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     torch.manual_seed(seed)
     for idx, verse in enumerate(melodys):
+        if INTERRUPTING:
+            return output_segments, duration - (segment_duration * len(output_segments))
+
         print(f"Generating New Melody Segment {idx + 1}: {text}\r")
         if output_segments:
             # If this isn't the first segment, use the last chunk of the previous segment as the input
@@ -166,7 +174,7 @@ def load_font(font_name, font_size=16):
 
     if font is None:
         try:
-            font_path = ImageFont.truetype(hf_hub_download(repo_id=
+            font_path = ImageFont.truetype(hf_hub_download(repo_id=os.environ.get('SPACE_ID', ''), filename="assets/" + font_name, repo_type="space"), encoding="UTF-8")
             font = ImageFont.truetype(font_path, font_size)
         except (FileNotFoundError, OSError):
             print("Font not found. Trying to download from local assets folder...\n")
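The rewritten font lookup fetches the font file from the Space's own repo with hf_hub_download, which downloads a single file and returns its local cache path. A sketch with illustrative arguments (the filename here is an assumption, not the app's actual asset):

```python
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="Surn/UnlimitedMusicGen",  # a Space repo id, e.g. from SPACE_ID
    filename="assets/arial.ttf",       # hypothetical font under assets/
    repo_type="space",
)
print(path)  # local cache path, usable with PIL's ImageFont.truetype
```

As committed, that path is additionally wrapped in an extra ImageFont.truetype call before being passed to truetype again; the sketch uses the plain path, which is what PIL expects.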