Update app.py
app.py CHANGED

@@ -8,9 +8,9 @@ import os
 from mutagen.mp3 import MP3
 import cv2
 from dotenv import load_dotenv
-from transformers import pipeline
-
-
+from transformers import pipeline, AutoProcessor, AutoModel
+import torch
+import soundfile as sf
 
 # Load environment variables
 load_dotenv()

@@ -24,6 +24,22 @@ def resize(img_list):
         resize_img_list.append(np.array(imResize))
     return resize_img_list
 
+def text2speech(text):
+    # Using Microsoft's SpeechT5 model instead of FastSpeech2
+    processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
+    model = AutoModel.from_pretrained("microsoft/speecht5_tts")
+
+    # Preprocessing text input
+    inputs = processor(text=text, return_tensors="pt")
+
+    # Generate speech with default speaker embedding
+    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+    # Save as flac file
+    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
+    return "speech_output.flac"
+
 def merge_audio_video(entities_num, resize_img_list, text_input):
     speech = text2speech(text_input)
     wav_audio = AudioSegment.from_file(speech, "flac")

@@ -41,18 +57,6 @@ def merge_audio_video(entities_num, resize_img_list, text_input):
 
     return mergedclip
 
-def text2speech(text):
-    # Generate speech from text using FastSpeech2
-    speech_output = fastspeech(text)
-    # Save the output as a .flac file (assuming the output is in numpy format)
-    with open("speech_output.flac", "wb") as f:
-        f.write(speech_output["audio"])
-    return "speech_output.flac"
-
-# Load FastSpeech2 model from Hugging Face directly
-fastspeech = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech", use_auth_token=HF_TOKEN)
-
-
 def engine(text_input):
     ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
     entities = ner(text_input)

@@ -68,15 +72,10 @@ def engine(text_input):
 
     resize_img_list = resize(img_list)
     mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
-    mergedclip.
+    mergedclip.write_videofile('mergedvideo.mp4')
 
     return 'mergedvideo.mp4'
 
-
-
-
-
-
 app = gr.Interface(
     fn=engine,
     inputs=gr.Textbox(lines=5, label="Input Text"),

@@ -87,4 +86,4 @@ app = gr.Interface(
     ],
     title="AI Pipeline Multi Model 🎞🎟️🍿 Movie Maker 🎬 🧠 🎨",
     article="<br><div></div>"
-).launch(debug=True)
+).launch(debug=True)
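
For reference, the SpeechT5 checkpoint is normally driven through its dedicated transformers classes plus the HiFi-GAN vocoder rather than a bare AutoModel load; a minimal sketch of that path follows. The class names, the microsoft/speecht5_hifigan vocoder checkpoint, and the all-zero speaker embedding are illustrative assumptions, not part of this commit.

# Sketch only: assumes a transformers release with SpeechT5 support (>= 4.27).
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
import torch

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")  # assumed vocoder checkpoint

def text2speech_sketch(text):
    # Tokenize the input text for the TTS model
    inputs = processor(text=text, return_tensors="pt")
    # Neutral all-zero speaker embedding (512-dim for this checkpoint);
    # a real x-vector would give a more natural voice
    speaker_embeddings = torch.zeros((1, 512))
    # Generate the waveform, passing the vocoder so the output is audio samples
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # SpeechT5 produces 16 kHz audio
    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
    return "speech_output.flac"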
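The body of merge_audio_video is untouched by this commit and mostly falls outside the diff context; below is a minimal sketch of how a list of image arrays and a narration track are typically combined with moviepy before the write_videofile call that engine() now makes. The helper name and timing choices are assumptions for illustration, not the Space's actual merge logic.

# Sketch only: assumed helper, not the code in this Space.
from moviepy.editor import AudioFileClip, ImageSequenceClip

def merge_images_with_audio(img_arrays, audio_path, out_path="mergedvideo.mp4"):
    audio = AudioFileClip(audio_path)                 # the .flac narration
    fps = max(len(img_arrays) / audio.duration, 0.1)  # spread the images over the narration
    clip = ImageSequenceClip(img_arrays, fps=fps).set_audio(audio)
    clip.write_videofile(out_path, fps=24)            # render, as engine() now does explicitly
    return out_path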