import gradio as gr
import torch
import torchaudio
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
import os
from PIL import Image, ImageDraw, ImageFont
import shlex
import subprocess
import spaces

# Install the bundled torchmcubes wheel at startup
subprocess.run(shlex.split('pip install wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl'))


@spaces.GPU
def generate_script(topic):
    """Uses an open-source LLM to generate an engaging script of 8-10 minutes."""
    llm = pipeline("text-generation", model="agentica-org/DeepScaleR-1.5B-Preview")
    prompt = (f"Write an engaging and informative script on the topic '{topic}'. "
              "The text should take about 8-10 minutes to read aloud at a normal pace.")
    # return_full_text=False keeps only the completion, so the prompt itself is not narrated
    response = llm(prompt, max_length=1500, do_sample=True, temperature=0.7, return_full_text=False)
    return response[0]['generated_text']


def create_centered_title(image_size, text, max_font_size=50, min_font_size=10, padding=20):
    """Creates a title image with auto-adjusting text size to fit within the image."""
    title_img = Image.new("RGB", image_size, (0, 0, 0))
    draw = ImageDraw.Draw(title_img)

    # Start from the maximum font size
    font_size = max_font_size
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
    except IOError:
        font = ImageFont.load_default()

    # Reduce the font size until the text fits within the image
    while font_size > min_font_size:
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_w = text_bbox[2] - text_bbox[0]
        text_h = text_bbox[3] - text_bbox[1]
        if text_w <= image_size[0] - 2 * padding and text_h <= image_size[1] - 2 * padding:
            break  # Text fits, exit the loop
        font_size -= 2  # Decrease the font size and try again
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except IOError:
            break  # The default bitmap font cannot be resized, keep it as-is

    # Center the text
    text_x = (image_size[0] - text_w) // 2
    text_y = (image_size[1] - text_h) // 2
    draw.text((text_x, text_y), text, font=font, fill="white")
    return title_img


def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Splits text into chunks that each take roughly min_sec to max_sec to read aloud."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


@spaces.GPU
def generate_speech(text):
    """Synthesizes the narration with Coqui TTS and returns the path to the WAV file."""
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path


@spaces.GPU
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    """Generates one image per text chunk, either with Stable Diffusion or as plain text slides."""
    image_paths = []
    if use_diffusion:
        # Load the pipeline once and reuse it for every chunk
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            # Fallback: render the chunk text onto a black background
            image = Image.new("RGB", image_size, (0, 0, 0))
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
            except IOError:
                font = ImageFont.load_default()
            draw.text((10, 10), chunk, font=font, fill="white")
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths


def create_video(images, durations, speech_path, movie_title, chunks, image_size=(640, 480)):
    """Assembles the title card, image slides, and narration into output.mp4."""
    clips = []

    # Title clip rendered with PIL instead of ImageMagick
    title_img = create_centered_title(image_size, movie_title)
    title_img_path = "title.png"
    title_img.save(title_img_path)
    title_clip = mp.ImageClip(title_img_path).set_duration(2).set_position('center')
    clips.append(title_clip)

    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        clips.append(clip)

    # End on two seconds of black
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"


def process_text(text, movie_title, image_size, use_diffusion, num_steps):
    """Runs the full pipeline: chunking, speech synthesis, image generation, and video assembly."""
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, chunks, image_size)
    return video_path


with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator for YouTubers using AI 🎥")
    gr.Markdown("""
    Turn your ideas into engaging videos effortlessly! 🎬
    Simply upload a text file or enter a topic, and our AI will generate a compelling script for you.
    The system then brings your script to life by creating relevant images with Stable Diffusion and compiling them into a video.
    To make your content even more engaging, AI-powered text-to-speech (TTS) generates realistic voice narration for the video.
    Perfect for content creators looking to streamline their workflow and focus on creativity! 🚀
    """)
    text_input = gr.Textbox(label="Enter your text (or leave empty to use a topic)")
    topic_input = gr.Textbox(label="Or enter a topic to generate text", placeholder="Example: The Future of AI")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=40, label="Diffusion Model Steps")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, topic, movie_title, file, image_size, use_diffusion, num_steps):
        # Prefer an uploaded file, then typed text, then a script generated from the topic
        if file is not None and hasattr(file, "name"):  # 'file' is a temporary file object from gr.File
            with open(file.name, "r") as f:
                text = f.read()
        elif not text and topic:
            text = generate_script(topic)
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps)

    process_btn.click(handle_request,
                      inputs=[text_input, topic_input, movie_title_input, file_input,
                              image_size_input, use_diffusion_input, num_steps_input],
                      outputs=output_video)

demo.launch(share=True)
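
# Optional: a minimal local smoke test that bypasses the Gradio UI. This is a sketch only;
# the sample text and title below are made up, and it assumes the models above can be
# downloaded in your environment. Uncomment to use (and comment out demo.launch above so
# the script does not block before reaching it).
#
# if __name__ == "__main__":
#     sample_text = ("Artificial intelligence is changing how creators plan, write, and "
#                    "produce videos, from scripting to narration and visuals.")
#     print(process_text(sample_text, "The Future of AI", (640, 480),
#                        use_diffusion=False, num_steps=1))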