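# Source: Hugging Face Space file "app.py" (author: BilalSardar).
# Last commit: c97767e "Update app.py" -- file size 3.16 kB.
# (Page navigation links "raw" / "history blame" omitted from the code.)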
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
def load_video(video_path):
    """Read a video and return its standardised, cropped grayscale frames.

    Each decoded frame is passed through ``tf.image.rgb_to_grayscale`` and
    cropped to rows 190:236 and columns 80:220. The collected crops are then
    standardised with the mean and standard deviation of the whole clip.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        A float32 tensor of the standardised crops, one entry per decoded
        frame.

    Raises:
        ValueError: If no frame could be decoded from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is actually
                # decodable; stop instead of feeding ``None`` to TensorFlow.
                break
            # NOTE(review): OpenCV documents BGR channel order while this call
            # expects RGB; left untouched because the checkpoint was
            # presumably trained with this exact preprocessing -- confirm.
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Always free the capture handle, even if decoding raised.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {video_path!r}")

    # The mean is taken without a float cast (unlike the std below), i.e. in
    # the frames' own dtype; kept as-is so inference matches the original
    # preprocessing.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
def load_data(path: "tf.Tensor"):
    """Load the preprocessed frames for the video that ``path`` refers to.

    If ``path`` names an existing file, that file is loaded directly.
    Otherwise the path is treated as naming a dataset sample: its final
    component (with either ``/`` or ``\\`` as separator) is cut at the first
    dot and looked up as ``data/s1/<name>.mpg``.

    Args:
        path: Scalar string tensor holding the video path; it is decoded via
            ``path.numpy()``.

    Returns:
        Whatever ``load_video`` returns for the resolved video path.
    """
    raw_path = bytes.decode(path.numpy())

    # Prefer the file the caller actually pointed at (e.g. a fresh upload),
    # regardless of platform separator or file extension.
    if os.path.isfile(raw_path):
        return load_video(raw_path)

    # Fallback: resolve the sample name inside the bundled dataset folder.
    # Normalising the separator makes this work for Windows and POSIX paths.
    base_name = raw_path.replace('\\', '/').split('/')[-1]
    file_name = base_name.split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    return load_video(video_path)
# Characters the lookups know about, one vocabulary entry per character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward lookup: character -> integer id (out-of-vocabulary token is "").
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")

# Inverse lookup: integer id -> character, built from the forward
# layer's vocabulary so both directions agree.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)
# Network definition: three Conv3D -> ReLU -> MaxPool3D stages, a per-step
# flatten, two bidirectional LSTM layers (each followed by dropout) and a
# softmax over the vocabulary size plus one extra class.
model = Sequential([
    Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(256, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(75, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    TimeDistributed(Flatten()),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    Dense(
        char_to_num.vocabulary_size() + 1,
        kernel_initializer='he_normal',
        activation='softmax',
    ),
])
model.summary()

# Use the same legacy optimizer class that was used during training; swap
# legacy.Adam for the appropriate one if the checkpoint was trained otherwise.
optimizer = legacy.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Restore the trained weights from the checkpoint.
model.load_weights('model/checkpoint')
def Predict(Video):
    """Run the lip-reading model on a video and return the decoded text.

    Args:
        Video: Path of the video file, as passed in by the Gradio "video"
            input.

    Returns:
        The greedy CTC decoding of the model output as a ``str``.

    Raises:
        ValueError: If no video was supplied.
    """
    if not Video:
        # Submitting without an upload would otherwise fail inside
        # TensorFlow with an opaque tensor-conversion error.
        raise ValueError("No video was provided; please upload a clip first.")

    sample = load_data(tf.convert_to_tensor(Video))
    yhat = model.predict(tf.expand_dims(sample, axis=0))

    # One sequence length per batch item, read from the prediction itself
    # instead of hard-coding the model's 75 time steps.
    input_length = [yhat.shape[1]] * yhat.shape[0]
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=input_length, greedy=True
    )[0][0].numpy()

    result = [
        tf.strings.reduce_join([num_to_char(word) for word in sentence])
        for sentence in decoded
    ]
    return result[0].numpy().decode('utf-8')
# Gradio front end: one video input wired to Predict, plain-text output.
interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs="text",
    title='Video Lip Reading',
)

# Start the web server with debug output enabled.
interface.launch(debug=True)