File size: 3,638 Bytes
99705b8
59bb3da
99705b8
 
 
 
 
 
7dbebe4
 
 
3696a79
 
 
7dbebe4
 
3696a79
 
 
 
 
3919a75
3696a79
99705b8
3696a79
59bb3da
 
 
 
3696a79
 
 
 
 
 
 
 
99705b8
3696a79
 
 
 
 
 
99705b8
3696a79
 
 
 
 
 
 
 
 
 
 
 
99705b8
3696a79
 
 
99705b8
3696a79
 
 
 
 
99705b8
3696a79
 
 
99705b8
3696a79
 
 
 
 
99705b8
3696a79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99705b8
 
3696a79
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import random
import uuid

import gradio as gr
import spaces
import torch
import torchaudio
from einops import rearrange
from huggingface_hub import login
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# Make sure the folder for generated audio files exists
os.makedirs('static', exist_ok=True)

# Authenticate against the Hugging Face Hub when a token is configured;
# a failed login is only warned about so the app can still start.
hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hub_token:
    try:
        login(token=hub_token.strip(), add_to_git_credential=True)
    except Exception as e:
        print(f"Warnung: Login fehlgeschlagen - {str(e)}")

def _get_model():
    """Load the pretrained model once and reuse it on subsequent calls."""
    if not hasattr(_get_model, "_cache"):
        _get_model._cache = get_pretrained_model("stabilityai/stable-audio-open-1.0")
    return _get_model._cache

@spaces.GPU(duration=300)
def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
    """Generate an audio clip from a text prompt with Stable Audio Open 1.0.

    Args:
        prompt: Text description of the desired sound.
        duration: Clip length in seconds.
        steps: Number of diffusion steps (more = slower, usually better).
        cfg_scale: Classifier-free guidance scale.

    Returns:
        Path to the generated 16-bit WAV file under ``static/``.

    Raises:
        Exception: re-raises whatever the pipeline raised, after logging it.
    """
    try:
        # Fresh random seed per request so identical prompts still vary
        seed = random.randint(0, 2**63 - 1)
        random.seed(seed)
        torch.manual_seed(seed)

        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Cached model; only the device move happens per call
        model, model_config = _get_model()
        model = model.to(device)

        sample_rate = model_config["sample_rate"]
        sample_size = model_config["sample_size"]

        # Text + timing conditioning for the diffusion model
        conditioning = [{
            "prompt": prompt,
            "seconds_start": 0,
            "seconds_total": duration
        }]

        # Generate audio with the user-adjustable sampler parameters
        output = generate_diffusion_cond(
            model,
            steps=steps,
            cfg_scale=cfg_scale,
            conditioning=conditioning,
            sample_size=sample_size,
            sigma_min=0.3,
            sigma_max=500,
            sampler_type="dpmpp-3m-sde",
            device=device
        )

        # Collapse the batch dimension: (b, d, n) -> (d, b*n)
        output = rearrange(output, "b d n -> d (b n)")

        # Peak-normalize to int16. Clamp the peak away from zero so a
        # silent output does not divide by zero and produce NaN samples.
        peak = torch.max(torch.abs(output)).clamp(min=1e-8)
        output = output.to(torch.float32).div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()

        # Unique filename so concurrent requests don't overwrite each other
        output_path = f"static/generated_audio_{uuid.uuid4().hex}.wav"
        torchaudio.save(output_path, output, sample_rate)

        return output_path

    except Exception as e:
        print(f"Fehler bei der Audiogenerierung: {str(e)}")
        raise  # bare raise keeps the original traceback intact

# Custom CSS for a nicer appearance (neutral background, centered layout)
custom_css = """
body { background-color: #f6f6f6; }
.gradio-container { max-width: 800px; margin: auto; }
"""

# Gradio front end: prompt and sampler controls on the left,
# the rendered audio clip on the right.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Stable Audio Generator")
    gr.Markdown("Generieren Sie Audio aus Textbeschreibungen mit Stable Audio 1.0")

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Beschreiben Sie den gewünschten Sound..."
            )
            duration_slider = gr.Slider(
                label="Dauer (Sekunden)",
                minimum=1, maximum=30, value=10, step=1
            )
            steps_slider = gr.Slider(
                label="Anzahl der Schritte",
                minimum=20, maximum=100, value=50, step=5
            )
            cfg_slider = gr.Slider(
                label="CFG Scale",
                minimum=1, maximum=15, value=7, step=0.5
            )
            run_button = gr.Button("Generieren")

        with gr.Column():
            audio_output = gr.Audio(label="Generiertes Audio", type="filepath")

    # Wire the button to the generator; the returned path feeds the player
    run_button.click(
        fn=generate_audio,
        inputs=[prompt_input, duration_slider, steps_slider, cfg_slider],
        outputs=audio_output
    )

if __name__ == "__main__":
    demo.launch()