Phil Sobrepena committed
Commit d15bc7d · Parent: bac321f

visibility, simplified ui

Files changed (1):
  app.py  +44 -12
app.py CHANGED
@@ -60,14 +60,14 @@ net, feature_utils, seq_cfg = get_model()
 
 @spaces.GPU(duration=120)
 @torch.inference_mode()
-def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str):
-
-    num_steps = 25
-    cfg_strength = 4.5
-    duration = 8.0
-    rng = torch.Generator(device=device)
-    rng.seed()
+def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
+                   cfg_strength: float, duration: float):
 
+    rng = torch.Generator(device=device)
+    if seed >= 0:
+        rng.manual_seed(seed)
+    else:
+        rng.seed()
     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
     video_info = load_video(video, duration)
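
The new seed argument follows the usual PyTorch convention: a non-negative value seeds the generator for reproducible sampling, while a negative value (the UI default of -1, see the last hunk) draws a fresh seed. A minimal standalone sketch of that behavior; the make_rng name and tensor shapes are illustrative, not from app.py:

import torch

def make_rng(seed: int, device: str = 'cpu') -> torch.Generator:
    # seed >= 0: deterministic stream; seed < 0: fresh seed from OS entropy
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    return rng

a = torch.randn(4, generator=make_rng(42))
b = torch.randn(4, generator=make_rng(42))
assert torch.equal(a, b)  # the same seed reproduces the same draw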
@@ -98,6 +98,38 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str):
     return video_save_path
 
 
+# @spaces.GPU(duration=120)
+# @torch.inference_mode()
+# def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
+#                   duration: float):
+
+#     rng = torch.Generator(device=device)
+#     if seed >= 0:
+#         rng.manual_seed(seed)
+#     else:
+#         rng.seed()
+#     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+
+#     clip_frames = sync_frames = None
+#     seq_cfg.duration = duration
+#     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
+
+#     audios = generate(clip_frames,
+#                       sync_frames, [prompt],
+#                       negative_text=[negative_prompt],
+#                       feature_utils=feature_utils,
+#                       net=net,
+#                       fm=fm,
+#                       rng=rng,
+#                       cfg_strength=cfg_strength)
+#     audio = audios.float().cpu()[0]
+
+#     audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
+#     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
+#     log.info(f'Saved audio to {audio_save_path}')
+#     return audio_save_path
+
+
 video_to_audio_tab = gr.Interface(
     fn=video_to_audio,
     description="""
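
The hunk above adds a text-to-audio variant but keeps it commented out: it skips video conditioning by passing clip_frames = sync_frames = None to generate() and saves a FLAC file instead of a video. If it were ever re-enabled, one plausible wiring is a second gr.Interface combined with gr.TabbedInterface; the sketch below is an assumption, with the tab labels and 'Text to Audio' title invented here, not part of this commit:

# Hypothetical wiring if text_to_audio were uncommented.
text_to_audio_tab = gr.Interface(
    fn=text_to_audio,
    inputs=[
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='audio',
    cache_examples=False,
    title='Text to Audio',
)

gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
                   ['Video-to-Audio', 'Text-to-Audio']).launch()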
@@ -112,14 +144,14 @@ video_to_audio_tab = gr.Interface(
         gr.Video(),
         gr.Text(label='Prompt'),
         gr.Text(label='Negative prompt', value='music'),
-        # gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-        # gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        # gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        # gr.Number(label='Duration (sec)', value=8, minimum=1),
+        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1, visible=False),
+        gr.Number(label='Num steps', value=25, precision=0, minimum=1, visible=False),
+        gr.Number(label='Guidance Strength', value=4.5, minimum=1, visible=False),
+        gr.Number(label='Duration (sec)', value=8, minimum=1, visible=False),
     ],
     outputs='playable_video',
     cache_examples=False,
-    title='Sonisphere — Sonic Branding through Multi-modal Audio Synthesis',
+    title='Sonisphere — Sonic Branding through Multi Modal Audio Synthesis',
     examples=[
     ])
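
The four gr.Number inputs return here with visible=False rather than staying commented out, which matches the commit message: the controls disappear from the page ('simplified ui'), but because they remain registered as inputs, Gradio still passes their default values to video_to_audio, so the widened function signature keeps working. A minimal standalone example of the pattern; greet and its fields are illustrative only:

import gradio as gr

def greet(name: str, repeat: int):
    # 'repeat' still arrives even though its control is hidden: Gradio
    # passes the component's default value for invisible inputs.
    return ' '.join([f'Hello, {name}!'] * int(repeat))

demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Text(label='Name'),
        gr.Number(label='Repeat', value=2, precision=0, visible=False),
    ],
    outputs='text',
)

if __name__ == '__main__':
    demo.launch()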
 
 