import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm
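
# NOTE: assumes moviepy >= 2.0 (top-level imports and `with_audio`), plus
# openai-whisper, opencv-python, gradio, and tqdm installed, with ffmpeg
# available on PATH for both whisper and moviepy.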

# Constants
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2


class VideoTranscriber:
    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []
        self.fps = 0

    def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        video.close()  # release the source file handle once the audio is written
        self.audio_path = audio_path
        print('[INFO] Audio extracted')
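
    # The extracted audio file is reused later to mux sound back onto the
    # annotated frames, so it stays on disk until create_annotated_video()
    # cleans it up.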

    def transcribe_video(self):
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        cap.release()
        effective_width = int(width - (width * 0.1))  # keep a 10% side margin
        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
        print('[INFO] Transcription complete')
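
    # Each text_segments entry is [line_text, start_frame, end_frame]; e.g. a
    # segment spoken from 1.0 s to 2.5 s at 30 fps maps to frames 30 through 75.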

    def _split_text_to_lines(self, text, max_width):
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = (line + " " + word) if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines
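
    # Greedy word wrap: e.g. "hello brave new world" with a narrow max_width
    # might come back as ["hello brave", "new world"], depending on the
    # measured pixel widths.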

    def extract_and_annotate_frames(self, output_dir):
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Draw every caption line active for this frame, stacked upward from
            # the bottom so wrapped lines of one segment don't overwrite each other.
            active = [text for text, start, end in self.text_segments
                      if start <= frame_count <= end]
            for i, text in enumerate(active):
                text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                text_x = (frame.shape[1] - text_size[0]) // 2
                text_y = frame.shape[0] - 30 - (len(active) - 1 - i) * (text_size[1] + 10)
                cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE,
                            (0, 0, 255), FONT_THICKNESS)
            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1
        cap.release()
        print('[INFO] Frame extraction complete')
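
    # NOTE: frames are written as JPEGs, so the output is a lossy re-encode of
    # the source video; switching to PNG would be lossless at the cost of disk
    # space and encode time.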

    def create_annotated_video(self, output_video_path):
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)
        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        audio.close()  # close before deleting so the file isn't locked on Windows
        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')


def process_video(video_path):
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path
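
# Standalone usage (bypassing the Gradio UI), a minimal sketch; "input.mp4"
# is a placeholder path:
#   out = process_video("input.mp4")
#   print(out)  # e.g. "input_transcribed.mp4"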


# Gradio Interface
def gradio_interface(video):
    return process_video(video)


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)

if __name__ == "__main__":
    iface.launch()
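    # To expose a temporary public URL (useful on a remote machine), Gradio
    # also supports iface.launch(share=True).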