EconLabAI committed (verified)
Commit 99705b8 · 1 Parent(s): 05a1da5

Create app.py

Files changed (1):
  app.py  +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
+ import spaces
+ import torch
+ import torchaudio
+ from einops import rearrange
+ from stable_audio_tools import get_pretrained_model
+ from stable_audio_tools.inference.generation import generate_diffusion_cond
+ import gradio as gr
+
+ @spaces.GPU(duration=180)
+ def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # Load the model and move it to the device
+     model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
+     model = model.to(device)
+
+     sample_rate = model_config["sample_rate"]
+     sample_size = model_config["sample_size"]
+
+     # Set up the conditioning
+     conditioning = [{
+         "prompt": prompt,
+         "seconds_start": 0,
+         "seconds_total": duration
+     }]
+
+     # Generate audio with adjustable parameters
+     output = generate_diffusion_cond(
+         model,
+         steps=steps,
+         cfg_scale=cfg_scale,
+         conditioning=conditioning,
+         sample_size=sample_size,
+         sigma_min=0.3,
+         sigma_max=500,
+         sampler_type="dpmpp-3m-sde",  # higher-quality sampler
+         device=device
+     )
+
+     # Rearrange the audio batch into a single sequence
+     output = rearrange(output, "b d n -> d (b n)")
+
+     # Peak normalization, clipping, conversion to int16
+     output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+
+     return output, sample_rate
+
+ def generate(prompt, duration=10, steps=50, cfg_scale=7):
+     audio, sr = generate_audio(prompt, duration, steps, cfg_scale)
+     return (sr, audio.numpy().T)  # transpose to (samples, channels) as expected by gr.Audio
+
+ # Improved user interface
+ iface = gr.Interface(
+     fn=generate,
+     inputs=[
+         gr.Textbox(label="Prompt", placeholder="Describe the desired sound..."),
+         gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Duration (seconds)"),
+         gr.Slider(minimum=20, maximum=100, value=50, step=5, label="Number of steps"),
+         gr.Slider(minimum=1, maximum=15, value=7, step=0.5, label="CFG Scale"),
+     ],
+     outputs=gr.Audio(label="Generated audio"),
+     title="Stable Audio Generator",
+     description="Generate audio from text descriptions with Stable Audio Open 1.0",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
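
For a quick smoke test without the Gradio UI, generate_audio can be called directly. A minimal sketch, assuming app.py is importable from the working directory and the spaces package is installed so its import succeeds; it saves one clip with torchaudio, which app.py imports but never uses, and the prompt text is purely illustrative:

from app import generate_audio
import torchaudio

# generate_audio returns an int16 tensor of shape (channels, samples) plus the sample rate
audio, sr = generate_audio("warm analog synth pad, 120 BPM", duration=5, steps=30)

# torchaudio.save expects (channels, frames); an int16 tensor is written as 16-bit PCM
torchaudio.save("output.wav", audio, sr)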