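"""Gradio demo: sign-language video to text.

Splits an uploaded video into fixed-interval clips, classifies each clip with an
InceptionV3 feature extractor feeding a pretrained sequence model (model.json /
model.h5), joins the predicted labels into an English sentence, and translates
it into Urdu with googletrans. train.csv supplies the label vocabulary.
"""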
from tensorflow_docs.vis import embed
from tensorflow import keras
from tensorflow.keras.models import model_from_json
from imutils import paths
from googletrans import Translator
import tensorflow as tf
import pandas as pd
import numpy as np
import gradio as gr
import imageio
import cv2
import os

# googletrans handles the final English -> Urdu translation step.
translator = Translator()
# Build the label vocabulary from the training annotations.
train_df = pd.read_csv("train.csv")
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
print(label_processor.get_vocabulary())

labels = train_df["tag"].values
labels = label_processor(labels[..., None]).numpy()
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 100
MAX_SEQ_LENGTH = 20    # number of frames fed to the sequence model per clip
NUM_FEATURES = 2048    # size of the pooled InceptionV3 feature vector per frame
# Load the sequence model architecture from JSON and its trained weights.
with open("model.json", "r") as json_file:
    sequence_model_json = json_file.read()
sequence_model = model_from_json(sequence_model_json)
sequence_model.load_weights("model.h5")
def crop_center_square(frame):
    # Crop the largest centered square out of the frame.
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]
def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    # Read a video, center-crop and resize every frame, and convert BGR -> RGB.
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)
def create_clips(video_path, interval):
    # Split the input video into consecutive clips of `interval` seconds,
    # written to the working directory as clip_1.mp4, clip_2.mp4, ...
    interval = int(interval)
    NoOfClips = 0
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = frame_count / fps
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    for i in range(0, int(duration), interval):
        NoOfClips += 1
        start_time = i
        end_time = min(i + interval, duration)
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        out = cv2.VideoWriter(f"clip_{NoOfClips}.mp4", fourcc, fps, (width, height))
        for j in range(start_frame, end_frame):
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
        out.release()
    cap.release()
    return NoOfClips
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()
def prepare_single_video(frames):
    # Compute per-frame InceptionV3 features and a padding mask for one clip.
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask
def sequence_prediction(path):
    # Classify one clip and return the most probable label from the vocabulary.
    class_vocab = label_processor.get_vocabulary()
    frames = load_video(path)  # clips are written to the working directory by create_clips
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]
    return class_vocab[np.argmax(probabilities)]
def SignTotext(video, interval):
    # Split the uploaded video into clips, predict one sign per clip, then
    # translate the joined English sentence into Urdu.
    NoofClips = create_clips(video, interval)
    Text = []
    for i in range(1, NoofClips + 1):  # clips are named clip_1.mp4 ... clip_N.mp4
        Text.append(sequence_prediction(f"clip_{i}.mp4"))
    EnglishText = " ".join(Text)
    translated_text = translator.translate(EnglishText, dest="ur")
    return EnglishText, translated_text.text
demo = gr.Interface(
    fn=SignTotext,
    inputs=[gr.Video(), gr.Number(label="Enter the duration (seconds) in which one sign is completed")],
    outputs=[gr.Textbox(label="English Text"), gr.Textbox(label="Urdu Text")],
title="Urdu Sign to Video") | |
demo.launch(debug=True) | |