"""Gradio demo that transcribes speech from a video of a speaker's mouth.

The script builds a Conv3D + bidirectional-LSTM network, loads weights from
``model/checkpoint`` and serves a small web UI whose single action is
``Predict``.
"""

import os
from typing import List

import cv2
import gradio as gr
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy

# Number of frames per clip: used both as the model's time dimension and as
# the sequence length handed to the CTC decoder.
FRAMES_PER_CLIP = 75


def load_video(video_path: str) -> tf.Tensor:
    """Read a video, crop every frame to a fixed window and normalise it.

    Args:
        video_path: Path of the video file to open with OpenCV.

    Returns:
        A float32 tensor holding the cropped, single-channel frames after
        subtracting the mean and dividing by the standard deviation.

    Raises:
        ValueError: If no frame could be read from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable;
                # stop instead of passing ``None`` to TensorFlow.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            # Fixed crop window (rows 190-235, columns 80-219) that matches
            # the 46x140 spatial input shape of the model below.
            frames.append(frame[190:236,80:220,:])
    finally:
        # Release the capture handle even if decoding raised.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from {video_path!r}")

    # NOTE(review): the mean is computed and subtracted *before* the float
    # cast, i.e. in the frames' original dtype. This arithmetic is kept
    # exactly as-is because the loaded checkpoint was presumably trained
    # with the same preprocessing -- confirm before changing it.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std


def load_data(path: tf.Tensor) -> tf.Tensor:
    """Resolve the video that belongs to ``path`` and return its frames.

    The base name of ``path`` (text before the first dot) is looked up as
    ``data/s1/<name>.mpg``. If that file does not exist, ``path`` itself is
    loaded instead.

    Args:
        path: Scalar string tensor containing a file path.

    Returns:
        The normalised frame tensor produced by ``load_video``.
    """
    raw_path = bytes.decode(path.numpy())
    # Accept both Windows and POSIX separators so the same base name is
    # extracted regardless of the platform the app runs on.
    file_name = raw_path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('data','s1',f'{file_name}.mpg')
    if not os.path.exists(video_path):
        # No dataset copy of this clip: fall back to the supplied file.
        video_path = raw_path
    return load_video(video_path)


# Character vocabulary and the lookup layers mapping characters <-> indices.
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# Network definition. It must match the architecture the checkpoint was
# saved from, otherwise ``load_weights`` cannot restore it.
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(FRAMES_PER_CLIP,46,140,1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1,2,2)))

# Flatten the spatial dimensions of each time step, keeping the time axis.
model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

# One output class per vocabulary entry plus one extra class.
model.add(Dense(char_to_num.vocabulary_size()+1, kernel_initializer='he_normal', activation='softmax'))

model.summary()

optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer you used during training
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Loading weights
model.load_weights('model/checkpoint')


def Predict(Video):
    """Run the model on a video and return the decoded text.

    Args:
        Video: File path of the video, as supplied by the Gradio input.

    Returns:
        The greedy CTC decoding of the model output as a UTF-8 string.
    """
    sample = load_data(tf.convert_to_tensor(Video))
    # Add the batch dimension expected by ``model.predict``.
    yhat = model.predict(tf.expand_dims(sample, axis=0))
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[FRAMES_PER_CLIP], greedy=True)[0][0].numpy()
    result = [
        tf.strings.reduce_join([num_to_char(word) for word in sentence])
        for sentence in decoded
    ]
    return result[0].numpy().decode('utf-8')


interface = gr.Interface(fn=Predict, inputs="video", outputs="text", title='Video Lip Reading')

interface.launch(debug=True)