Phil Sobrepena committed on
Commit
65f1027
·
1 Parent(s): eb4ead1
Files changed (1) hide show
  1. app.py +3 -35
app.py CHANGED
@@ -64,7 +64,7 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: 0,
64
  cfg_strength: 4.5, duration: 8.0):
65
 
66
  rng = torch.Generator(device=device)
67
- if seed >= 1:
68
  rng.manual_seed(seed)
69
  else:
70
  rng.seed()
@@ -98,42 +98,10 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: 0,
98
  return video_save_path
99
 
100
 
101
- @spaces.GPU(duration=120)
102
- @torch.inference_mode()
103
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
104
- duration: float):
105
-
106
- rng = torch.Generator(device=device)
107
- if seed >= 0:
108
- rng.manual_seed(seed)
109
- else:
110
- rng.seed()
111
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
112
-
113
- clip_frames = sync_frames = None
114
- seq_cfg.duration = duration
115
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
116
-
117
- audios = generate(clip_frames,
118
- sync_frames, [prompt],
119
- negative_text=[negative_prompt],
120
- feature_utils=feature_utils,
121
- net=net,
122
- fm=fm,
123
- rng=rng,
124
- cfg_strength=cfg_strength)
125
- audio = audios.float().cpu()[0]
126
-
127
- audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
128
- torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
129
- log.info(f'Saved audio to {audio_save_path}')
130
- return audio_save_path
131
-
132
-
133
  video_to_audio_tab = gr.Interface(
134
  fn=video_to_audio,
135
  description="""
136
- Sonisphere | Video-to-Audio
137
  NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
138
  Doing so does not improve results.
139
 
@@ -150,7 +118,7 @@ video_to_audio_tab = gr.Interface(
150
  ],
151
  outputs='playable_video',
152
  cache_examples=False,
153
- title='Sonisphere — Video-to-Audio Synthesis',
154
  examples=[
155
  ])
156
 
 
64
  cfg_strength: 4.5, duration: 8.0):
65
 
66
  rng = torch.Generator(device=device)
67
+ if seed >= 0:
68
  rng.manual_seed(seed)
69
  else:
70
  rng.seed()
 
98
  return video_save_path
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  video_to_audio_tab = gr.Interface(
102
  fn=video_to_audio,
103
  description="""
104
+ Video-to-Audio
105
  NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
106
  Doing so does not improve results.
107
 
 
118
  ],
119
  outputs='playable_video',
120
  cache_examples=False,
121
+ title='Sonisphere — Sonic Branding Synthesis',
122
  examples=[
123
  ])
124