import re
import string
from string import digits

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm


class Dataset:
    """Index-addressable view that turns text rows into padded id sequences.

    Each item is a triple ``(encoder_seq, decoder_inp_seq, decoder_out_seq)``
    of ``int32`` arrays of shape ``(1, max_len)``, produced by tokenizing one
    row and post-padding it to ``max_len``.

    Args:
        data: Table-like object indexable by column name whose columns expose
            ``.values`` (presumably a pandas DataFrame -- confirm at the call
            site). Must provide the columns ``'ass'``, ``'eng_inp'`` and
            ``'eng_out'``.
        tknizer_ass: Fitted tokenizer for the ``'ass'`` (source) column; must
            provide ``texts_to_sequences``.
        tknizer_eng: Fitted tokenizer for the English columns; must provide
            ``texts_to_sequences``.
        max_len: Length every sequence is padded (or truncated) to.
    """

    def __init__(self, data, tknizer_ass, tknizer_eng, max_len):
        self.encoder_inps = data['ass'].values
        self.decoder_inps = data['eng_inp'].values
        self.decoder_outs = data['eng_out'].values
        self.tknizer_eng = tknizer_eng
        self.tknizer_ass = tknizer_ass
        self.max_len = max_len

    def _encode(self, tokenizer, text):
        """Tokenize a single text and pad it to ``(1, max_len)``."""
        # texts_to_sequences expects a list of texts, hence the 1-element list.
        seq = tokenizer.texts_to_sequences([text])
        # NOTE(review): only padding is set to 'post'; truncation of sequences
        # longer than max_len uses pad_sequences' default side -- confirm that
        # this is intended for the decoder sequences.
        return pad_sequences(
            seq, maxlen=self.max_len, dtype='int32', padding='post'
        )

    def __getitem__(self, i):
        """Return the padded (encoder, decoder-input, decoder-output) triple.

        Results are kept in locals rather than on ``self`` so that indexing
        the dataset does not mutate shared state.
        """
        encoder_seq = self._encode(self.tknizer_ass, self.encoder_inps[i])
        decoder_inp_seq = self._encode(self.tknizer_eng, self.decoder_inps[i])
        decoder_out_seq = self._encode(self.tknizer_eng, self.decoder_outs[i])
        return encoder_seq, decoder_inp_seq, decoder_out_seq

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.encoder_inps)


class Dataloder(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that groups ``Dataset`` items into batches.

    Each batch has the structure ``([encoder_inp, decoder_inp], decoder_out)``
    where every array has shape ``(batch_size, max_len)``. A trailing partial
    batch is dropped (see ``__len__``).

    Args:
        dataset: Object supporting ``dataset[j]`` (returning a tuple of arrays
            with a leading axis of size 1) and exposing ``encoder_inps``.
        batch_size: Number of samples per batch.
    """

    def __init__(self, dataset, batch_size=1):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))

    def __getitem__(self, i):
        """Build batch ``i`` following the current sample order.

        Raises:
            IndexError: If ``i`` is outside ``[-len(self), len(self))``.
        """
        n_batches = len(self)
        if i < 0:
            i += n_batches
        if not 0 <= i < n_batches:
            raise IndexError('batch index out of range')

        start = i * self.batch_size
        # Go through self.indexes so the permutation drawn in on_epoch_end
        # actually changes which samples land in each batch.
        batch_ids = self.indexes[start:start + self.batch_size]
        samples = [self.dataset[j] for j in batch_ids]

        # Every sample component is (1, max_len); joining along axis 0 gives
        # (batch_size, max_len) per component.
        encoder_inp, decoder_inp, decoder_out = (
            np.concatenate(parts, axis=0) for parts in zip(*samples)
        )

        # Shape the batch as ([source_inp, english_inp], english_out); the
        # texts have already been converted to id sequences by the dataset.
        return [encoder_inp, decoder_inp], decoder_out

    def __len__(self):
        """Return the number of full batches per epoch."""
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order for the next epoch."""
        self.indexes = np.random.permutation(self.indexes)