BilalSardar's picture
Update app.py
d84bc8b
raw
history blame
2.84 kB
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
def load_video(video_path):
cap = cv2.VideoCapture(video_path)
frames = []
for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
ret, frame = cap.read()
frame = tf.image.rgb_to_grayscale(frame)
frames.append(frame[190:236,80:220,:])
cap.release()
mean = tf.math.reduce_mean(frames)
std = tf.math.reduce_std(tf.cast(frames, tf.float32))
return tf.cast((frames - mean), tf.float32) / std
def load_data(video_path):
frames = load_video(video_path)
return frames
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))
model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))
model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))
model.add(TimeDistributed(Flatten()))
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))
model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))
model.summary()
optimizer = legacy.Adam(learning_rate=0.001) # Replace legacy.Adam with the appropriate legacy optimizer you used during training
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
#Loading weights
model.load_weights('model/checkpoint')
def Predict(Video):
sample = load_data(tf.convert_to_tensor(Video))
yhat = model.predict(tf.expand_dims(sample, axis=0))
decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
result=[tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
return result[0].numpy().decode('utf-8')
interface = gr.Interface(fn=Predict,
inputs="video",
outputs="text",
title='Video Lip Reading')
interface.launch(debug=True)