import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm
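
# NOTE: assumes moviepy >= 2.0 (top-level imports and `with_audio`), plus
# openai-whisper, opencv-python, gradio, and tqdm installed, with ffmpeg
# available on PATH for both whisper and moviepy.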

# Constants
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2


class VideoTranscriber:
    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []
        self.fps = 0

    def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        video.close()  # release the source file handle once the audio is written
        self.audio_path = audio_path
        print('[INFO] Audio extracted')
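
    # The extracted audio file is reused later to mux sound back onto the
    # annotated frames, so it stays on disk until create_annotated_video()
    # cleans it up.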

    def transcribe_video(self):
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        cap.release()
        effective_width = int(width - (width * 0.1))  # keep a 10% side margin
        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
        print('[INFO] Transcription complete')
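
    # Each text_segments entry is [line_text, start_frame, end_frame]; e.g. a
    # segment spoken from 1.0 s to 2.5 s at 30 fps maps to frames 30 through 75.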

    def _split_text_to_lines(self, text, max_width):
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = (line + " " + word) if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines
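
    # Greedy word wrap: e.g. "hello brave new world" with a narrow max_width
    # might come back as ["hello brave", "new world"], depending on the
    # measured pixel widths.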

    def extract_and_annotate_frames(self, output_dir):
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Draw every caption line active for this frame, stacked upward from
            # the bottom so wrapped lines of one segment don't overwrite each other.
            active = [text for text, start, end in self.text_segments
                      if start <= frame_count <= end]
            for i, text in enumerate(active):
                text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                text_x = (frame.shape[1] - text_size[0]) // 2
                text_y = frame.shape[0] - 30 - (len(active) - 1 - i) * (text_size[1] + 10)
                cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE,
                            (0, 0, 255), FONT_THICKNESS)
            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1
        cap.release()
        print('[INFO] Frame extraction complete')
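
    # NOTE: frames are written as JPEGs, so the output is a lossy re-encode of
    # the source video; switching to PNG would be lossless at the cost of disk
    # space and encode time.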

    def create_annotated_video(self, output_video_path):
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)
        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        audio.close()  # close before deleting so the file isn't locked on Windows
        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')


def process_video(video_path):
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path
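
# Standalone usage (bypassing the Gradio UI), a minimal sketch; "input.mp4"
# is a placeholder path:
#   out = process_video("input.mp4")
#   print(out)  # e.g. "input_transcribed.mp4"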


# Gradio Interface
def gradio_interface(video):
    return process_video(video)


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)

if __name__ == "__main__":
    iface.launch()
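    # To expose a temporary public URL (useful on a remote machine), Gradio
    # also supports iface.launch(share=True).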