hunyuanvideo-foley / app_working_simple.py
wzy013's picture
Implement direct API calling version of HunyuanVideo-Foley
7315716
raw
history blame
11.8 kB
import os
import tempfile
import gradio as gr
import torch
import torchaudio
from loguru import logger
from typing import Optional, Tuple
import requests
import json
def create_realistic_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
    """Synthesize a placeholder sound effect and write it to a WAV file.

    The waveform is chosen from keywords found in ``text_prompt``
    (footsteps / rain / wind / car, in English or Chinese); any other prompt
    produces a tone with two overtones whose pitch depends on the prompt length.

    Args:
        video_file: Accepted for interface symmetry; this function does not
            read it.
        text_prompt: Description used to pick the kind of sound.
        duration: Length of the generated clip in seconds. Must yield at
            least one sample at 48 kHz.

    Returns:
        Path of the generated mono 48 kHz WAV file. Every call creates a new
        temporary directory that this function does not remove.

    Raises:
        ValueError: If ``duration`` is too small to produce a single sample.
    """
    import math  # function-scope import keeps the file's import block untouched

    sample_rate = 48000
    duration_samples = int(duration * sample_rate)
    if duration_samples <= 0:
        raise ValueError(
            f"duration must be positive and at least 1/{sample_rate} s, got {duration!r}"
        )

    two_pi = 2 * math.pi
    # Time axis in seconds for every sample of the clip.
    t = torch.linspace(0, duration, duration_samples)

    # Pick the waveform from the prompt; lower-case once for the English keywords.
    prompt_lower = text_prompt.lower()
    if "footsteps" in prompt_lower or "步" in text_prompt:
        # Footsteps: low-frequency pulse that decays within each 0.5 s period.
        audio = 0.4 * torch.sin(two_pi * 2 * t) * torch.exp(-3 * (t % 0.5))
    elif "rain" in prompt_lower or "雨" in text_prompt:
        # Rain: white noise.
        audio = 0.3 * torch.randn(duration_samples)
    elif "wind" in prompt_lower or "风" in text_prompt:
        # Wind: slow oscillation mixed with noise.
        audio = 0.3 * torch.sin(two_pi * 0.5 * t) + 0.2 * torch.randn(duration_samples)
    elif "car" in prompt_lower or "车" in text_prompt:
        # Vehicle: two mixed low tones.
        audio = 0.3 * torch.sin(two_pi * 80 * t) + 0.2 * torch.sin(two_pi * 120 * t)
    else:
        # Default: a tone whose base frequency depends on the prompt length,
        # plus its second and third harmonics.
        base_freq = 220 + len(text_prompt) * 5
        audio = 0.3 * torch.sin(two_pi * base_freq * t)
        audio += 0.1 * torch.sin(two_pi * base_freq * 2 * t)
        audio += 0.05 * torch.sin(two_pi * base_freq * 3 * t)

    # Fade in/out to avoid clicks at the clip boundaries. The fade length is
    # capped at half the clip so short clips neither overflow the envelope nor
    # have the two ramps overlap.
    envelope = torch.ones_like(audio)
    fade_samples = min(int(0.1 * sample_rate), duration_samples // 2)
    if fade_samples > 0:
        envelope[:fade_samples] = torch.linspace(0, 1, fade_samples)
        envelope[-fade_samples:] = torch.linspace(1, 0, fade_samples)
    audio *= envelope

    # Save to a fresh temporary directory; the caller receives the file path.
    temp_dir = tempfile.mkdtemp()
    audio_path = os.path.join(temp_dir, "enhanced_demo_audio.wav")
    torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)
    return audio_path
def check_real_api_availability():
    """Probe which real inference back ends look usable from this process.

    The checks are best-effort: any failure simply leaves the corresponding
    flag ``False`` and never propagates to the caller.

    Returns:
        dict with three boolean flags:
        ``"gradio_client"`` - the ``gradio_client`` package imported and a
        client for the ``tencent/HunyuanVideo-Foley`` Space was constructed;
        ``"hf_inference"`` - ``HF_TOKEN`` or ``HUGGING_FACE_HUB_TOKEN`` is set
        to a non-empty value;
        ``"replicate"`` - the ``replicate`` package imported and
        ``REPLICATE_API_TOKEN`` is set to a non-empty value.
    """
    api_status = {
        "gradio_client": False,
        "hf_inference": False,
        "replicate": False
    }

    # gradio_client: try to build a client for the hosted Space.
    # `except Exception` (not a bare except) keeps the probe best-effort while
    # letting KeyboardInterrupt / SystemExit propagate.
    try:
        from gradio_client import Client
        # Client() has no `timeout` parameter; HTTP options such as the
        # timeout are forwarded through `httpx_kwargs`. On client versions
        # without `httpx_kwargs` the resulting TypeError is caught below and
        # the flag stays False.
        Client("tencent/HunyuanVideo-Foley", httpx_kwargs={"timeout": 5})
        api_status["gradio_client"] = True
    except Exception:
        pass

    # Hugging Face inference: only the presence of a token is checked.
    hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    if hf_token:
        api_status["hf_inference"] = True

    # Replicate: the package must import and a token must be configured.
    try:
        import replicate  # noqa: F401 - imported only to test availability
        if os.environ.get('REPLICATE_API_TOKEN'):
            api_status["replicate"] = True
    except Exception:
        pass

    return api_status
def process_video_smart(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
    """Generate demo audio samples for an uploaded video and build a status report.

    Real back ends are only probed (and reported in the status text); the audio
    itself always comes from ``create_realistic_demo_audio``.

    Args:
        video_file: Uploaded video (a path, or an object exposing ``.name``);
            ``None`` means nothing was uploaded.
        text_prompt: Description of the desired sound. ``None``, empty and
            whitespace-only prompts fall back to a generic default.
        guidance_scale: Echoed in the status text only.
        inference_steps: Echoed in the status text only.
        sample_nums: Requested number of samples; at most 3 are generated.

    Returns:
        ``(audio_paths, status_message)``. ``audio_paths`` is empty when no
        video was supplied or when generation failed.
    """
    if video_file is None:
        return [], "❌ 请上传视频文件!"

    # An empty text box arrives as "" rather than None, so test for blankness.
    if not text_prompt or not text_prompt.strip():
        text_prompt = "audio sound effects for this video"

    # Probe the real APIs; the result is only reported, not acted upon yet.
    api_status = check_real_api_availability()
    logger.info(f"API可用性检查: {api_status}")

    try:
        logger.info(f"处理视频: {video_file}")
        logger.info(f"文本提示: {text_prompt}")

        # Work out a display name for the upload: plain paths and objects that
        # expose `.name` are both supported; anything else gets a generic label.
        if isinstance(video_file, (str, os.PathLike)):
            video_label = os.path.basename(str(video_file))
        elif hasattr(video_file, 'name'):
            video_label = os.path.basename(str(video_file.name))
        else:
            video_label = ''
        video_label = video_label or '已上传'

        # Sliders may deliver floats; range() needs an int. Cap at 3 samples.
        sample_count = min(int(sample_nums), 3)

        # NOTE(review): the suffix only changes the prompt text; it has the same
        # length for every sample, so the deterministic waveforms appear to be
        # identical across samples - confirm whether real variation is wanted.
        audio_outputs = [
            create_realistic_demo_audio(video_file, f"{text_prompt}_variation_{index + 1}")
            for index in range(sample_count)
        ]

        status_msg = f"""✅ 增强演示版本处理完成!
📹 **视频**: {video_label}
📝 **提示**: "{text_prompt}"
⚙️ **设置**: CFG={guidance_scale}, 步数={inference_steps}, 样本={sample_nums}
🎵 **生成**: {len(audio_outputs)} 个音频样本
🧠 **智能特性**:
• 根据文本内容选择音频类型
• 脚步声/雨声/风声/车辆声等不同效果
• 48kHz高质量输出
• 自动淡入淡出和包络处理
📊 **API状态检查**:
• Gradio Client: {'✅' if api_status['gradio_client'] else '❌'}
• HF Inference: {'✅' if api_status['hf_inference'] else '❌'}
• Replicate: {'✅' if api_status['replicate'] else '❌'}
💡 **这是增强演示版本,展示真实AI音频的工作流程**
🚀 **完整版本**: https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""
        return audio_outputs, status_msg
    except Exception as e:
        # Top-level boundary for the UI: log with traceback, report to the user.
        logger.exception(f"处理失败: {str(e)}")
        return [], f"❌ 处理失败: {str(e)}"
def create_smart_interface():
    """Assemble the Gradio Blocks UI for the smart demo and return it.

    The page consists of a header, an informational notice, an input column
    (video, prompt, sliders, generate button), an output column (three audio
    slots plus a status box), a list of suggested prompts and a footer.
    The returned Blocks object is not launched here.
    """
    # Static markup is collected up front so the layout code below stays short.
    page_css = """
.smart-notice {
background: linear-gradient(135deg, #e8f4fd 0%, #f0f8ff 100%);
border: 2px solid #1890ff;
border-radius: 12px;
padding: 1.5rem;
margin: 1rem 0;
color: #0050b3;
}
.api-status {
background: #f6ffed;
border: 1px solid #52c41a;
border-radius: 8px;
padding: 1rem;
margin: 1rem 0;
color: #389e0d;
}
"""
    header_html = """
<div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
<h1>🎵 HunyuanVideo-Foley</h1>
<p>智能演示版 - 真实工作流程体验</p>
</div>
"""
    notice_html = """
<div class="smart-notice">
<strong>🧠 智能演示模式:</strong>
<br>• 自动检测可用API服务
<br>• 根据文本内容生成对应音效类型
<br>• 完整展示AI音频生成工作流程
<br>• <strong>支持</strong>: 脚步声、雨声、风声、车辆声等多种音效
</div>
"""
    examples_html = """
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1rem 0;">
<div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
<strong>脚步声:</strong> footsteps on wooden floor<br>
<strong>自然音:</strong> rain drops on leaves<br>
<strong>环境音:</strong> wind through the trees
</div>
<div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
<strong>机械音:</strong> car engine running<br>
<strong>动作音:</strong> door opening and closing<br>
<strong>水声:</strong> water flowing in stream
</div>
</div>
"""
    footer_html = """
<div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
<p><strong>🧠 智能演示版</strong> - 展示完整的AI音频生成工作流程</p>
<p>💡 根据不同描述词生成对应类型的音效</p>
<p>🔗 完整版本: <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
</div>
"""

    with gr.Blocks(css=page_css, title="HunyuanVideo-Foley Smart Demo") as ui:
        # Page header followed by the "smart mode" notice.
        gr.HTML(header_html)
        gr.HTML(notice_html)

        with gr.Row():
            # Left column: everything the user supplies.
            with gr.Column(scale=1):
                gr.Markdown("### 📹 视频输入")
                video_input = gr.Video(label="上传视频文件")
                text_input = gr.Textbox(
                    label="🎯 音频描述",
                    placeholder="例如:footsteps on wood floor, rain on leaves, wind through trees, car engine",
                    lines=3,
                    value="footsteps on the ground",
                )
                with gr.Row():
                    guidance_scale = gr.Slider(
                        minimum=1.0, maximum=10.0, value=4.5, step=0.1,
                        label="🎚️ CFG Scale",
                    )
                    inference_steps = gr.Slider(
                        minimum=10, maximum=100, value=50, step=5,
                        label="⚡ 推理步数",
                    )
                sample_nums = gr.Slider(
                    minimum=1, maximum=3, value=2, step=1,
                    label="🎲 样本数量",
                )
                generate_btn = gr.Button("🎵 智能生成音频", variant="primary")

            # Right column: generated audio and the status report.
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 生成结果")
                # Three audio slots; only the first one starts out visible.
                audio_output_1, audio_output_2, audio_output_3 = [
                    gr.Audio(label=f"样本 {slot}", visible=(slot == 1))
                    for slot in (1, 2, 3)
                ]
                status_output = gr.Textbox(
                    label="处理状态",
                    interactive=False,
                    lines=12,
                    placeholder="等待处理...",
                )

        # Suggested prompts.
        gr.Markdown("### 🌟 推荐提示词")
        gr.HTML(examples_html)

        # Event handlers. Their names and parameter names are kept as-is
        # because Gradio may derive endpoint/parameter names from them.
        def process_smart(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
            generated, report = process_video_smart(
                video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
            )
            # Pad (or trim) to exactly three slots so each audio widget gets a value.
            first, second, third = (list(generated[:3]) + [None, None, None])[:3]
            return first, second, third, report

        def update_visibility(sample_nums):
            requested = int(sample_nums)
            # Slot 1 is always shown; slots 2 and 3 follow the requested count.
            return [
                gr.update(visible=True),
                gr.update(visible=requested >= 2),
                gr.update(visible=requested >= 3),
            ]

        # Wire the events (registration order is kept: slider first, then button).
        audio_slots = [audio_output_1, audio_output_2, audio_output_3]
        sample_nums.change(
            fn=update_visibility,
            inputs=[sample_nums],
            outputs=audio_slots,
        )
        generate_btn.click(
            fn=process_smart,
            inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
            outputs=audio_slots + [status_output],
        )

        # Footer
        gr.HTML(footer_html)

    return ui
if __name__ == "__main__":
    # Route log records of level INFO and above to stdout. Messages are
    # printed without an extra newline, exactly as the sink receives them.
    def _stdout_sink(message):
        print(message, end='')

    logger.remove()
    logger.add(_stdout_sink, level="INFO")
    logger.info("启动 HunyuanVideo-Foley 智能演示版...")

    # Build the UI, then serve it on all interfaces at port 7860.
    app = create_smart_interface()
    logger.info("智能演示版就绪 - 支持多种音效类型")

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
    }
    app.launch(**launch_options)