Spaces:
Sleeping
Sleeping
File size: 3,638 Bytes
99705b8 59bb3da 99705b8 7dbebe4 3696a79 7dbebe4 3696a79 3919a75 3696a79 99705b8 3696a79 59bb3da 3696a79 99705b8 3696a79 99705b8 3696a79 99705b8 3696a79 99705b8 3696a79 99705b8 3696a79 99705b8 3696a79 99705b8 3696a79 99705b8 3696a79 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import spaces
import random
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond
import gradio as gr
import os
from huggingface_hub import login
# Make sure the directory that receives the generated audio file exists.
os.makedirs("static", exist_ok=True)

# Authentication: log in to the Hugging Face Hub only when a token is
# supplied through the environment.
raw_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if raw_token:
    token = raw_token.strip()
    try:
        login(token=token, add_to_git_credential=True)
    except Exception as e:
        # A failed login is reported but does not stop the app from starting.
        print(f"Warnung: Login fehlgeschlagen - {str(e)}")
def _to_int16_pcm(audio):
    """Peak-normalize a waveform tensor and convert it to int16 on the CPU.

    A silent (all-zero) waveform stays all zeros instead of being divided
    by a zero peak.
    """
    audio = audio.to(torch.float32)
    # Floor the peak so that 0 / 0 cannot produce NaN samples, whose
    # conversion to int16 would be undefined.
    peak = audio.abs().max().clamp(min=1e-8)
    return audio.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()


@spaces.GPU(duration=300)
def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
    """Generate audio from a text prompt and write it to a WAV file.

    Args:
        prompt: Text description used as the ``prompt`` conditioning.
        duration: Value passed to the model as ``seconds_total``.
        steps: Number of sampler steps; converted to ``int`` before use.
        cfg_scale: Value forwarded to the sampler as ``cfg_scale``.

    Returns:
        The path of the written file, ``static/generated_audio.wav``.

    Raises:
        Exception: Any error raised while loading the model, sampling or
            saving is printed and then re-raised unchanged.
    """
    try:
        seed = random.randint(0, 2**63 - 1)
        random.seed(seed)
        torch.manual_seed(seed)
        # NOTE(review): the seed is not passed to generate_diffusion_cond;
        # confirm whether that function reseeds internally.

        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the model and move it to the selected device.
        model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
        model = model.to(device)
        sample_rate = model_config["sample_rate"]
        sample_size = model_config["sample_size"]

        # Set up the conditioning.
        conditioning = [{
            "prompt": prompt,
            "seconds_start": 0,
            "seconds_total": duration,
        }]

        # Generate the audio with the caller-supplied parameters.
        output = generate_diffusion_cond(
            model,
            # UI sliders may deliver numbers as floats; the step count must
            # be an integer.
            steps=int(steps),
            cfg_scale=cfg_scale,
            conditioning=conditioning,
            sample_size=sample_size,
            sigma_min=0.3,
            sigma_max=500,
            sampler_type="dpmpp-3m-sde",
            device=device,
        )

        # Fold the batch dimension into the time axis, then convert to PCM.
        output = rearrange(output, "b d n -> d (b n)")
        output = _to_int16_pcm(output)

        # Save the audio.
        output_path = "static/generated_audio.wav"
        torchaudio.save(output_path, output, sample_rate)
        return output_path
    except Exception as e:
        print(f"Fehler bei der Audiogenerierung: {str(e)}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise
# Custom CSS for a nicer look.
custom_css = """
body { background-color: #f6f6f6; }
.gradio-container { max-width: 800px; margin: auto; }
"""

# Gradio interface built with Blocks: input controls in the first column,
# the generated audio in the second.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Stable Audio Generator")
    gr.Markdown("Generieren Sie Audio aus Textbeschreibungen mit Stable Audio 1.0")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Beschreiben Sie den gewünschten Sound...",
            )
            duration = gr.Slider(
                label="Dauer (Sekunden)",
                minimum=1,
                maximum=30,
                value=10,
                step=1,
            )
            steps = gr.Slider(
                label="Anzahl der Schritte",
                minimum=20,
                maximum=100,
                value=50,
                step=5,
            )
            cfg_scale = gr.Slider(
                label="CFG Scale",
                minimum=1,
                maximum=15,
                value=7,
                step=0.5,
            )
            generate_btn = gr.Button("Generieren")

        with gr.Column():
            output = gr.Audio(label="Generiertes Audio", type="filepath")

    # The order of the inputs matches the positional parameters of
    # generate_audio.
    generation_inputs = [prompt, duration, steps, cfg_scale]
    generate_btn.click(
        fn=generate_audio,
        inputs=generation_inputs,
        outputs=output,
    )
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()