# Page header captured together with this file (not Python code):
#   Spaces: Runtime error
import os
from typing import List

import gradio as gr
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
def load_alignments(path: str) -> "tf.Tensor":
    """Read an alignment file and encode its tokens as character ids.

    Each line is split on whitespace and its third field is taken as the
    token (presumably the layout is ``start end word`` -- confirm against the
    data files). Tokens equal to ``'sil'`` are dropped; the remaining tokens
    are joined with single spaces, split into characters and mapped through
    the module-level ``char_to_num`` lookup.

    Args:
        path: Path of the alignment file to read.

    Returns:
        The result of ``char_to_num`` on the flattened character sequence,
        with the first element (the leading separator space) sliced off.
    """
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines have no third field; skip them
            # instead of failing with IndexError.
            if len(fields) < 3:
                continue
            if fields[2] != 'sil':
                # Append in place; rebuilding the list for every token is
                # quadratic in the number of tokens.
                tokens.extend((' ', fields[2]))
    chars = tf.strings.unicode_split(tokens, input_encoding='UTF-8')
    # The [1:] slice removes the separator space added before the first token.
    return char_to_num(tf.reshape(chars, (-1)))[1:]
def load_data(path: "tf.Tensor"):
    """Load the frames and encoded alignment that belong to one sample.

    Only the base name (text before the first ``.``) of ``path`` is used; the
    video is then looked up as ``data/s1/<name>.mpg`` and the alignment as
    ``data/alignments/s1/<name>.align``.

    Args:
        path: Object whose ``.numpy()`` returns the sample path as bytes
            (callers in this file pass ``tf.convert_to_tensor(<str>)``).

    Returns:
        Tuple ``(frames, alignments)`` where ``frames`` is whatever
        ``load_video`` returns and ``alignments`` is the output of
        ``load_alignments``.
    """
    path = bytes.decode(path.numpy())
    # Accept both Windows ('\\') and POSIX ('/') separators. Splitting on
    # the backslash alone leaves the directory part attached on POSIX paths,
    # which then leaks into the joined paths below.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    # NOTE(review): load_video is not defined in the visible part of this
    # file -- it is assumed to be provided elsewhere; confirm.
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)
    return frames, alignments
# Characters the model can emit: lowercase letters, a few punctuation marks,
# the digits 1-9 and the space.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward lookup (character -> integer id); the out-of-vocabulary token is
# the empty string.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")

# Inverse lookup (integer id -> character) built from the forward layer's
# vocabulary so that both directions stay in sync.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)
# Network definition: three Conv3D -> ReLU -> MaxPool3D stages, a per-step
# flatten, two bidirectional LSTM layers (each followed by dropout) and a
# softmax classifier.
# Input shape is (75, 46, 140, 1) -- presumably 75 frames of 46x140
# single-channel images; confirm against the video loader.
model = Sequential([
    Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(256, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(75, 3, padding='same'),
    Activation('relu'),
    MaxPool3D((1, 2, 2)),

    # Flatten everything except the first non-batch axis so the recurrent
    # layers receive one feature vector per step.
    TimeDistributed(Flatten()),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    # One output per vocabulary entry plus one extra class (presumably the
    # CTC blank -- the predictions are later passed to ctc_decode).
    Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'),
])

model.summary()
# Use the same legacy optimizer class that was used during training; swap
# legacy.Adam for the appropriate one if training used something else.
optimizer = legacy.Adam(learning_rate=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Restore the trained weights from the checkpoint.
# NOTE(review): this is an absolute path; it must exist on the machine that
# runs this script -- confirm for the deployment environment.
model.load_weights('/content/models/checkpoint')
def Predict(Video):
    """Run the lip-reading model on one video and return the decoded text.

    Args:
        Video: Path of the video as supplied by the Gradio ``"video"`` input,
            or ``None`` when nothing was uploaded.

    Returns:
        The greedy CTC decoding of the model output as a Python ``str``, or a
        short notice when no video was provided.
    """
    # The handler receives None when the form is submitted without a video;
    # answer with a readable message instead of failing inside TensorFlow.
    if Video is None:
        return 'No video provided.'

    frames, _ = load_data(tf.convert_to_tensor(Video))
    # The model expects a batch axis, so wrap the single sample.
    yhat = model.predict(tf.expand_dims(frames, axis=0))

    # One sequence length per batch item, read from the prediction itself
    # rather than repeating the model's step count as a literal.
    input_length = [yhat.shape[1]] * yhat.shape[0]
    decoded = tf.keras.backend.ctc_decode(
        yhat, input_length=input_length, greedy=True
    )[0][0].numpy()

    # Map the whole id sequence of the first (only) sample in one lookup
    # call and join the characters into a single string.
    sentence = tf.strings.reduce_join(num_to_char(decoded[0]))
    return sentence.numpy().decode('utf-8')
# Web UI: one video input, one text output showing the decoded sentence.
# The output uses the top-level gr.Textbox component; the gr.inputs
# namespace is for input components and is not available in newer Gradio
# releases.
interface = gr.Interface(
    fn=Predict,
    inputs="video",
    outputs=[gr.Textbox(label='Generated Output')],
    title='Video Lip Reading',
)
interface.launch(debug=True)