Update app.py
app.py
CHANGED
@@ -11,6 +11,7 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 from transformers import pipeline
 from huggingface_hub import snapshot_download
+
 from examples.get_examples import get_examples
 from src.facerender.pirender_animate import AnimateFromCoeff_PIRender
 from src.utils.preprocess import CropAndExtract
@@ -37,23 +38,15 @@ def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
         frame_rate).export(wav_filename, format="wav")
 
 def get_pose_style_from_audio(audio_path):
-    """Determines pose style based on audio emotion using a pre-trained model."""
-    # Load the pre-trained emotion recognition model
     emotion_recognizer = pipeline("sentiment-analysis")
-
-    # Analyze the audio emotion
     results = emotion_recognizer(audio_path)
     emotion = results[0]["label"]
-
-    # Map emotion to pose style (you can adjust these mappings)
     pose_style_mapping = {
-        "POSITIVE": 15,
-        "NEGATIVE": 35,
-        "NEUTRAL": 0,
-        # Add more emotion mappings as needed
+        "POSITIVE": 15,
+        "NEGATIVE": 35,
+        "NEUTRAL": 0,
     }
-
-    return pose_style_mapping.get(emotion, 0)  # Default to neutral pose if unknown
+    return pose_style_mapping.get(emotion, 0)
 
 @spaces.GPU(duration=0)
 def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop', still_mode: bool = False,
@@ -61,7 +54,6 @@ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop
                    facerender: str = 'facevid2vid', exp_scale: float = 1.0, use_ref_video: bool = False,
                    ref_video: str = None, ref_info: str = None, use_idle_mode: bool = False,
                    length_of_audio: int = 0, use_blink: bool = True, result_dir: str = './results/') -> str:
-    # Initialize models and paths
     sadtalker_paths = init_path(
         checkpoint_path, config_path, size, False, preprocess)
     audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
@@ -69,18 +61,15 @@ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop
     animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device) if facerender == 'facevid2vid' and device != 'mps' \
         else AnimateFromCoeff_PIRender(sadtalker_paths, device)
 
-    # Create directories for saving results
     time_tag = str(uuid.uuid4())
     save_dir = os.path.join(result_dir, time_tag)
     os.makedirs(save_dir, exist_ok=True)
     input_dir = os.path.join(save_dir, 'input')
     os.makedirs(input_dir, exist_ok=True)
 
-    # Process source image
     pic_path = os.path.join(input_dir, os.path.basename(source_image))
     shutil.move(source_image, input_dir)
 
-    # Process driven audio
     if driven_audio and os.path.isfile(driven_audio):
         audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
         if '.mp3' in audio_path:
@@ -96,7 +85,6 @@ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop
     else:
         assert use_ref_video and ref_info == 'all'
 
-    # Process reference video
     if use_ref_video and ref_info == 'all':
         ref_video_videoname = os.path.splitext(os.path.split(ref_video)[-1])[0]
         audio_path = os.path.join(save_dir, ref_video_videoname+'.wav')
@@ -109,7 +97,6 @@ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop
     else:
         ref_video_coeff_path = None
 
-    # Preprocess source image
     first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
     os.makedirs(first_frame_dir, exist_ok=True)
     first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(
@@ -117,7 +104,6 @@ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop
     if first_coeff_path is None:
         raise AttributeError("No face is detected")
 
-    # Determine reference coefficients
     ref_pose_coeff_path, ref_eyeblink_coeff_path = None, None
     if use_ref_video:
         if ref_info == 'pose':
@@ -129,20 +115,17 @@ def generate_video(source_image: str, driven_audio: str, preprocess: str = 'crop
     else:
         ref_pose_coeff_path = ref_eyeblink_coeff_path = None
 
-    # Generate coefficients from audio or reference video
     if use_ref_video and ref_info == 'all':
         coeff_path = ref_video_coeff_path
     else:
         batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
                          still=still_mode, idlemode=use_idle_mode, length_of_audio=length_of_audio, use_blink=use_blink)
 
-        # Get pose style from audio
         pose_style = get_pose_style_from_audio(audio_path)
 
         coeff_path = audio_to_coeff.generate(
             batch, save_dir, pose_style, ref_pose_coeff_path)
 
-    # Generate video from coefficients
     data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode,
                                preprocess=preprocess, size=size, expression_scale=exp_scale, facemodel=facerender)
     return_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, enhancer='gfpgan' if use_enhancer else None,
@@ -160,7 +143,6 @@ async def generate_video_api(source_image: UploadFile = File(...), driven_audio:
                              use_ref_video: bool = Form(False), ref_video: UploadFile = File(None),
                              ref_info: str = Form(None), use_idle_mode: bool = Form(False),
                              length_of_audio: int = Form(0), use_blink: bool = Form(True), result_dir: str = Form('./results/')):
-    # Save the uploaded files temporarily
     temp_source_image_path = f"temp/{source_image.filename}"
     os.makedirs("temp", exist_ok=True)
     with open(temp_source_image_path, "wb") as buffer:
@@ -180,7 +162,6 @@ async def generate_video_api(source_image: UploadFile = File(...), driven_audio:
     else:
         temp_ref_video_path = None
 
-    # Generate the video
     video_path = generate_video(
         source_image=temp_source_image_path,
         driven_audio=temp_driven_audio_path,
@@ -200,10 +181,8 @@ async def generate_video_api(source_image: UploadFile = File(...), driven_audio:
         result_dir=result_dir
     )
 
-    # Clean up temporary files
     shutil.rmtree("temp")
 
-    # Return the generated video file
     return FileResponse(video_path)
 
 
@@ -314,4 +293,4 @@ html = """
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=
+    uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)