Spaces:
Runtime error
Runtime error
File size: 3,158 Bytes
56c7749 d6cfc3d f066e4f e830b84 56c7749 f066e4f c97767e 56c7749 c97767e f066e4f 56c7749 c2a0cd0 56c7749 f066e4f 56c7749 d84bc8b 56c7749 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
def load_video(video_path):
cap = cv2.VideoCapture(video_path)
frames = []
for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
ret, frame = cap.read()
frame = tf.image.rgb_to_grayscale(frame)
frames.append(frame[190:236,80:220,:])
cap.release()
mean = tf.math.reduce_mean(frames)
std = tf.math.reduce_std(tf.cast(frames, tf.float32))
return tf.cast((frames - mean), tf.float32) / std
def load_data(path: str):
path = bytes.decode(path.numpy())
#file_name = path.split('/')[-1].split('.')[0]
# File name splitting for windows
file_name = path.split('\\')[-1].split('.')[0]
video_path = os.path.join('data','s1',f'{file_name}.mpg')
alignment_path = os.path.join('data','alignments','s1',f'{file_name}.align')
frames = load_video(video_path)
return frames
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))
model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))
model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))
model.add(TimeDistributed(Flatten()))
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))
model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))
model.summary()
optimizer = legacy.Adam(learning_rate=0.001) # Replace legacy.Adam with the appropriate legacy optimizer you used during training
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
#Loading weights
model.load_weights('model/checkpoint')
def Predict(Video):
sample = load_data(tf.convert_to_tensor(Video))
yhat = model.predict(tf.expand_dims(sample, axis=0))
decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
result=[tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
return result[0].numpy().decode('utf-8')
interface = gr.Interface(fn=Predict,
inputs="video",
outputs="text",
title='Video Lip Reading')
interface.launch(debug=True) |