Uhhy committed on
Commit 4df1f63 · verified
1 Parent(s): c6f8a33

Update app.py

Files changed (1)
  1. app.py +196 -96
app.py CHANGED
@@ -5,8 +5,11 @@ import shutil
  from pydub import AudioSegment
  import spaces
  import torch
- import gradio as gr
- from huggingface_hub import snapshot_download
 
  from examples.get_examples import get_examples
  from src.facerender.pirender_animate import AnimateFromCoeff_PIRender
@@ -19,24 +22,45 @@ from src.utils.init_path import init_path
 
  checkpoint_path = 'checkpoints'
  config_path = 'src/config'
- device = "cuda" if torch.cuda.is_available(
- ) else "mps" if platform.system() == 'Darwin' else "cpu"
 
  os.environ['TORCH_HOME'] = checkpoint_path
  snapshot_download(repo_id='vinthony/SadTalker-V002rc',
                    local_dir=checkpoint_path, local_dir_use_symlinks=True)
 
 
  def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
      AudioSegment.from_file(file=mp3_filename).set_frame_rate(
          frame_rate).export(wav_filename, format="wav")
 
 
  @spaces.GPU(duration=0)
- def generate_video(source_image, driven_audio, preprocess='crop', still_mode=False, use_enhancer=False,
-                    batch_size=1, size=256, pose_style=0, facerender='facevid2vid', exp_scale=1.0,
-                    use_ref_video=False, ref_video=None, ref_info=None, use_idle_mode=False,
-                    length_of_audio=0, use_blink=True, result_dir='./results/'):
      # Initialize models and paths
      sadtalker_paths = init_path(
          checkpoint_path, config_path, size, False, preprocess)
@@ -111,6 +135,10 @@ def generate_video(source_image, driven_audio, preprocess='crop', still_mode=Fal
      else:
          batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
                           still=still_mode, idlemode=use_idle_mode, length_of_audio=length_of_audio, use_blink=use_blink)
      coeff_path = audio_to_coeff.generate(
          batch, save_dir, pose_style, ref_pose_coeff_path)
 
@@ -124,94 +152,166 @@ def generate_video(source_image, driven_audio, preprocess='crop', still_mode=Fal
 
      return return_path
 
 
- # Gradio UI
- with gr.Blocks(analytics_enabled=False) as demo:
-     with gr.Row():
-         with gr.Column(variant='panel'):
-             with gr.Tabs(elem_id="sadtalker_source_image"):
-                 with gr.TabItem('Source image'):
-                     with gr.Row():
-                         source_image = gr.Image(
-                             label="Source image", sources="upload", type="filepath", elem_id="img2img_image")
-
-             with gr.Tabs(elem_id="sadtalker_driven_audio"):
-                 with gr.TabItem('Driving Methods'):
-                     gr.Markdown(
-                         "Possible driving combinations: <br> 1. Audio only 2. Audio/IDLE Mode + Ref Video(pose, blink, pose+blink) 3. IDLE Mode only 4. Ref Video only (all) ")
-
-                     with gr.Row():
-                         driven_audio = gr.Audio(
-                             label="Input audio", sources="upload", type="filepath")
-                         driven_audio_no = gr.Audio(
-                             label="Use IDLE mode, no audio is required", sources="upload", type="filepath", visible=False)
-
-                         with gr.Column():
-                             use_idle_mode = gr.Checkbox(
-                                 label="Use Idle Animation")
-                             length_of_audio = gr.Number(
-                                 value=5, label="The length(seconds) of the generated video.")
-                         use_idle_mode.change(lambda choice: (gr.update(visible=not choice), gr.update(visible=choice)),
-                                              inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no])
-
-                     with gr.Row():
-                         ref_video = gr.Video(
-                             label="Reference Video", sources="upload", elem_id="vidref")
-
-                         with gr.Column():
-                             use_ref_video = gr.Checkbox(
-                                 label="Use Reference Video")
-                             ref_info = gr.Radio(['pose', 'blink', 'pose+blink', 'all'], value='pose', label='Reference Video',
-                                                 info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
-
-                         ref_video.change(lambda path: gr.update(
-                             value=path is not None), inputs=ref_video, outputs=use_ref_video)
-
-         with gr.Column(variant='panel'):
-             with gr.Tabs(elem_id="sadtalker_checkbox"):
-                 with gr.TabItem('Settings'):
-                     with gr.Column(variant='panel'):
-                         with gr.Row():
-                             pose_style = gr.Slider(
-                                 minimum=0, maximum=45, step=1, label="Pose style", value=0)
-                             exp_weight = gr.Slider(
-                                 minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
-                             blink_every = gr.Checkbox(
-                                 label="use eye blink", value=True)
-
-                         with gr.Row():
-                             size_of_image = gr.Radio(
-                                 [256, 512], value=256, label='face model resolution', info="use 256/512 model?")
-                             preprocess_type = gr.Radio(
-                                 ['crop', 'resize', 'full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
-
-                         with gr.Row():
-                             is_still_mode = gr.Checkbox(
-                                 label="Still Mode (fewer head motion, works with preprocess `full`)")
-                             facerender = gr.Radio(
-                                 ['facevid2vid', 'pirender'], value='facevid2vid', label='facerender', info="which face render?")
-
-                         with gr.Row():
-                             batch_size = gr.Slider(
-                                 label="batch size in generation", step=1, maximum=10, value=1)
-                             enhancer = gr.Checkbox(
-                                 label="GFPGAN as Face enhancer", value=True)
-
-                         submit = gr.Button(
-                             'Generate', elem_id="sadtalker_generate", variant='primary')
-
-             with gr.Tabs(elem_id="sadtalker_generated"):
-                 gen_video = gr.Video(label="Generated video")
-
-     submit.click(
-         fn=generate_video,
-         inputs=[source_image, driven_audio, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
-                 pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every],
-         outputs=[gen_video],
      )
 
-     with gr.Row():
-         gr.Examples(examples=get_examples(), inputs=[source_image, driven_audio, preprocess_type, is_still_mode, enhancer],
-                     outputs=[gen_video], fn=generate_video)
-
- demo.launch(debug=True)
 
  from pydub import AudioSegment
  import spaces
  import torch
+ from fastapi import FastAPI, File, UploadFile, Form
+ from fastapi.responses import FileResponse
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.templating import Jinja2Templates
+ from transformers import pipeline
 
  from examples.get_examples import get_examples
  from src.facerender.pirender_animate import AnimateFromCoeff_PIRender
 
  checkpoint_path = 'checkpoints'
  config_path = 'src/config'
+ device = "cuda" if torch.cuda.is_available() else "mps" if platform.system() == 'Darwin' else "cpu"
 
  os.environ['TORCH_HOME'] = checkpoint_path
  snapshot_download(repo_id='vinthony/SadTalker-V002rc',
                    local_dir=checkpoint_path, local_dir_use_symlinks=True)
 
+ app = FastAPI()
+ app.mount("/results", StaticFiles(directory="results"), name="results")
+ templates = Jinja2Templates(directory="templates")
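Note: StaticFiles(directory="results") requires the results directory to exist when the module is imported (Starlette raises RuntimeError otherwise), and snapshot_download is still called above even though this commit drops its import. A minimal setup sketch, assuming it is added alongside these lines rather than being part of the diff:

import os
from huggingface_hub import snapshot_download  # still needed by the call above

os.makedirs("results", exist_ok=True)    # for app.mount("/results", ...)
os.makedirs("templates", exist_ok=True)  # for Jinja2Templates(directory="templates")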
 
  def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
      AudioSegment.from_file(file=mp3_filename).set_frame_rate(
          frame_rate).export(wav_filename, format="wav")
 
+ def get_pose_style_from_audio(audio_path):
+     """Determines pose style based on audio emotion using a pre-trained model."""
+     # Load the pre-trained emotion recognition model
+     emotion_recognizer = pipeline("sentiment-analysis")
+
+     # Analyze the audio emotion
+     results = emotion_recognizer(audio_path)
+     emotion = results[0]["label"]
+
+     # Map emotion to pose style (you can adjust these mappings)
+     pose_style_mapping = {
+         "POSITIVE": 15,   # Happy
+         "NEGATIVE": 35,   # Sad
+         "NEUTRAL": 0,     # Normal
+         # Add more emotion mappings as needed
+     }
+
+     return pose_style_mapping.get(emotion, 0)  # Default to neutral pose if unknown
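The helper added above feeds an audio file path into a text sentiment-analysis pipeline, so it classifies the path string rather than the speech itself, and the default sentiment model never returns "NEUTRAL". A hedged alternative sketch using an audio-classification pipeline instead; the model name and its label set ('neu', 'hap', 'ang', 'sad') are assumptions, not part of this commit:

from transformers import pipeline

def get_pose_style_from_speech(audio_path):
    # Speech emotion recognition on the waveform itself (model choice is illustrative)
    classifier = pipeline("audio-classification", model="superb/hubert-base-superb-er")
    emotion = classifier(audio_path)[0]["label"]           # top-scoring label, e.g. 'hap'
    pose_style_mapping = {"hap": 15, "sad": 35, "ang": 25, "neu": 0}
    return pose_style_mapping.get(emotion, 0)              # default to neutral pose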
 
  @spaces.GPU(duration=0)
+ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop', still_mode: bool = False,
+                    use_enhancer: bool = False, batch_size: int = 1, size: int = 256,
+                    facerender: str = 'facevid2vid', exp_scale: float = 1.0, use_ref_video: bool = False,
+                    ref_video: str = None, ref_info: str = None, use_idle_mode: bool = False,
+                    length_of_audio: int = 0, use_blink: bool = True, result_dir: str = './results/') -> str:
      # Initialize models and paths
      sadtalker_paths = init_path(
          checkpoint_path, config_path, size, False, preprocess)
 
      else:
          batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
                           still=still_mode, idlemode=use_idle_mode, length_of_audio=length_of_audio, use_blink=use_blink)
+
+     # Get pose style from audio
+     pose_style = get_pose_style_from_audio(audio_path)
+
      coeff_path = audio_to_coeff.generate(
          batch, save_dir, pose_style, ref_pose_coeff_path)
 
 
 
      return return_path
 
+ @app.post("/generate")
+ async def generate_video_api(source_image: UploadFile = File(...), driven_audio: UploadFile = File(None),
+                              preprocess: str = Form('crop'), still_mode: bool = Form(False),
+                              use_enhancer: bool = Form(False), batch_size: int = Form(1), size: int = Form(256),
+                              facerender: str = Form('facevid2vid'), exp_scale: float = Form(1.0),
+                              use_ref_video: bool = Form(False), ref_video: UploadFile = File(None),
+                              ref_info: str = Form(None), use_idle_mode: bool = Form(False),
+                              length_of_audio: int = Form(0), use_blink: bool = Form(True), result_dir: str = Form('./results/')):
+     # Save the uploaded files temporarily
+     temp_source_image_path = f"temp/{source_image.filename}"
+     os.makedirs("temp", exist_ok=True)
+     with open(temp_source_image_path, "wb") as buffer:
+         shutil.copyfileobj(source_image.file, buffer)
+
+     if driven_audio:
+         temp_driven_audio_path = f"temp/{driven_audio.filename}"
+         with open(temp_driven_audio_path, "wb") as buffer:
+             shutil.copyfileobj(driven_audio.file, buffer)
+     else:
+         temp_driven_audio_path = None
 
+     if ref_video:
+         temp_ref_video_path = f"temp/{ref_video.filename}"
+         with open(temp_ref_video_path, "wb") as buffer:
+             shutil.copyfileobj(ref_video.file, buffer)
+     else:
+         temp_ref_video_path = None
+
+     # Generate the video
+     video_path = generate_video(
+         source_image=temp_source_image_path,
+         driven_audio=temp_driven_audio_path,
+         preprocess=preprocess,
+         still_mode=still_mode,
+         use_enhancer=use_enhancer,
+         batch_size=batch_size,
+         size=size,
+         facerender=facerender,
+         exp_scale=exp_scale,
+         use_ref_video=use_ref_video,
+         ref_video=temp_ref_video_path,
+         ref_info=ref_info,
+         use_idle_mode=use_idle_mode,
+         length_of_audio=length_of_audio,
+         use_blink=use_blink,
+         result_dir=result_dir
      )
 
+     # Clean up temporary files
+     shutil.rmtree("temp")
+
+     # Return the generated video file
+     return FileResponse(video_path)
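A sketch of a client call against the /generate endpoint above; the host, port, and file names are placeholders:

import requests

files = {
    "source_image": open("face.png", "rb"),
    "driven_audio": open("speech.wav", "rb"),
}
data = {"preprocess": "crop", "size": 256, "use_enhancer": "true"}
resp = requests.post("http://localhost:8000/generate", files=files, data=data)
resp.raise_for_status()
# The endpoint streams back the rendered video file
with open("generated.mp4", "wb") as f:
    f.write(resp.content)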
+
+
+ @app.get("/")
+ async def root(request):
+     return templates.TemplateResponse("index.html", {"request": request})
+
+ # HTML Template (`templates/index.html`)
+ html = """
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>SadTalker API</title>
+     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css">
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/jquery.slim.min.js"></script>
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/umd/popper.min.js"></script>
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.min.js"></script>
+ </head>
+ <body>
+     <div class="container mt-5">
+         <h1>SadTalker API</h1>
+         <form method="POST" action="/generate" enctype="multipart/form-data">
+             <div class="form-group">
+                 <label for="source_image">Source Image:</label>
+                 <input type="file" class="form-control-file" id="source_image" name="source_image" required>
+             </div>
+             <div class="form-group">
+                 <label for="driven_audio">Driving Audio:</label>
+                 <input type="file" class="form-control-file" id="driven_audio" name="driven_audio">
+             </div>
+             <div class="form-group">
+                 <label for="preprocess">Preprocess:</label>
+                 <select class="form-control" id="preprocess" name="preprocess">
+                     <option value="crop">Crop</option>
+                     <option value="resize">Resize</option>
+                     <option value="full">Full</option>
+                     <option value="extcrop">ExtCrop</option>
+                     <option value="extfull">ExtFull</option>
+                 </select>
+             </div>
+             <div class="form-check">
+                 <input type="checkbox" class="form-check-input" id="still_mode" name="still_mode">
+                 <label class="form-check-label" for="still_mode">Still Mode</label>
+             </div>
+             <div class="form-check">
+                 <input type="checkbox" class="form-check-input" id="use_enhancer" name="use_enhancer">
+                 <label class="form-check-label" for="use_enhancer">Use GFPGAN Enhancer</label>
+             </div>
+             <div class="form-group">
+                 <label for="batch_size">Batch Size:</label>
+                 <input type="number" class="form-control" id="batch_size" name="batch_size" min="1" max="10" value="1">
+             </div>
+             <div class="form-group">
+                 <label for="size">Face Model Resolution:</label>
+                 <select class="form-control" id="size" name="size">
+                     <option value="256">256</option>
+                     <option value="512">512</option>
+                 </select>
+             </div>
+             <div class="form-group">
+                 <label for="facerender">Face Render:</label>
+                 <select class="form-control" id="facerender" name="facerender">
+                     <option value="facevid2vid">FaceVid2Vid</option>
+                     <option value="pirender">PIRender</option>
+                 </select>
+             </div>
+             <div class="form-group">
+                 <label for="exp_scale">Expression Scale:</label>
+                 <input type="number" class="form-control" id="exp_scale" name="exp_scale" min="0" max="3" step="0.1" value="1.0">
+             </div>
+             <div class="form-check">
+                 <input type="checkbox" class="form-check-input" id="use_ref_video" name="use_ref_video">
+                 <label class="form-check-label" for="use_ref_video">Use Reference Video</label>
+             </div>
+             <div class="form-group">
+                 <label for="ref_video">Reference Video:</label>
+                 <input type="file" class="form-control-file" id="ref_video" name="ref_video">
+             </div>
+             <div class="form-group">
+                 <label for="ref_info">Reference Video Information:</label>
+                 <select class="form-control" id="ref_info" name="ref_info">
+                     <option value="pose">Pose</option>
+                     <option value="blink">Blink</option>
+                     <option value="pose+blink">Pose + Blink</option>
+                     <option value="all">All</option>
+                 </select>
+             </div>
+             <div class="form-check">
+                 <input type="checkbox" class="form-check-input" id="use_idle_mode" name="use_idle_mode">
+                 <label class="form-check-label" for="use_idle_mode">Use Idle Animation</label>
+             </div>
+             <div class="form-group">
+                 <label for="length_of_audio">Length of Audio (seconds):</label>
+                 <input type="number" class="form-control" id="length_of_audio" name="length_of_audio" min="0" value="0">
+             </div>
+             <div class="form-check">
+                 <input type="checkbox" class="form-check-input" id="use_blink" name="use_blink" checked>
+                 <label class="form-check-label" for="use_blink">Use Eye Blink</label>
+             </div>
+             <button type="submit" class="btn btn-primary">Generate</button>
+         </form>
+     </div>
+ </body>
+ </html>
+ """
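The html string above is defined inline but nothing writes it to disk, while the root route renders templates/index.html through Jinja2Templates; the handler's request argument also needs a Request annotation for FastAPI to inject the request object. A sketch of the glue this seems to assume, not part of the commit:

from fastapi import Request

os.makedirs("templates", exist_ok=True)
with open("templates/index.html", "w") as f:
    f.write(html)                                  # materialize the inline template

@app.get("/")
async def root(request: Request):                  # annotated so FastAPI injects the request
    return templates.TemplateResponse("index.html", {"request": request})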
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
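One caveat on the launch line: uvicorn only honors reload=True when the application is given as an import string; with an app object it logs a warning and disables reload. A sketch assuming the module is named app.py:

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)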