import re

import numpy as np


def get_segments(sentence):
    # Build the token-type (segment) ids BERT expects: every token up to and
    # including the first [SEP] gets id 0, everything after it gets id 1.
    # Cap the id at 1, because BERT's token-type embedding has only two
    # entries and padding after a second [SEP] must not produce id 2.
    segments = []
    segment_id = 0
    for token in sentence.split(" "):
        segments.append(segment_id)
        if token == "[SEP]" and segment_id < 1:
            segment_id += 1
    return [segments]


def tokenize(text, max_length, tokenizer, second_text=None):
    # re.sub strips punctuation and newlines; note that str.replace treats
    # the pattern as a literal string, not a regex, so it would do nothing.
    if second_text is None:
        # Reserve two slots for [CLS] and [SEP].
        tokens = tokenizer.tokenize(re.sub(r"[^\w\s]+|\n", "", text))[:max_length - 2]
        sentence = "[CLS] " + " ".join(tokens) + " [SEP]"
    else:
        text = tokenizer.tokenize(re.sub(r"[^\w\s]+|\n", "", text))
        second_text = tokenizer.tokenize(re.sub(r"[^\w\s]+|\n", "", second_text))
        # Reserve three slots for [CLS] and the two [SEP] tokens, trimming
        # whichever sentence is longer until the pair fits.
        while len(text) + len(second_text) > max_length - 3:
            if len(text) > len(second_text):
                text.pop()
            else:
                second_text.pop()
        sentence = ("[CLS] " + " ".join(text) + " [SEP] "
                    + " ".join(second_text) + " [SEP]")

    # Generate the attention mask.
    # BERT requires a mask marking which positions are real tokens and which
    # are padding: if max_length is 100 and the sentence has 90 tokens, the
    # mask is [1] * 90 + [0] * (100 - 90).
    sentence_length = len(sentence.split(" "))
    sentence_mask = [1] * sentence_length + [0] * (max_length - sentence_length)

    # Generate the input ids.
    # If the sentence is shorter than max_length, it is padded with [PAD].
    if sentence_length != max_length:
        sentence_padded = sentence + " [PAD]" * (max_length - sentence_length)
    else:
        sentence_padded = sentence
    sentence_converted = tokenizer.convert_tokens_to_ids(sentence_padded.split(" "))

    # Generate the segment ids: each [SEP] ends the current segment.
    sentence_segment = get_segments(sentence_padded)

    # Convert the lists into int32 arrays and return them.
    # The TensorFlow equivalent would be tf.cast(..., tf.int32) on each list.
    return [np.asarray(sentence_converted, dtype=np.int32).squeeze(),
            np.asarray(sentence_segment, dtype=np.int32).squeeze(),
            np.asarray(sentence_mask, dtype=np.int32).squeeze()]
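
# Usage sketch (assumptions not in the original: the Hugging Face
# `transformers` package and its pretrained "bert-base-uncased" vocabulary;
# any tokenizer exposing `tokenize` and `convert_tokens_to_ids` would work
# the same way).
if __name__ == "__main__":
    from transformers import BertTokenizer

    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Single sentence: returns input ids, segment ids, and attention mask,
    # each a (max_length,) int32 array.
    ids, segments, mask = tokenize(
        "BERT needs fixed-length inputs.", max_length=16, tokenizer=bert_tokenizer
    )
    print(ids.shape, segments.shape, mask.shape)

    # Sentence pair: the longer sentence is truncated first; segment ids are
    # 0 up to and including the first [SEP], 1 afterwards.
    ids, segments, mask = tokenize(
        "First sentence.",
        max_length=16,
        tokenizer=bert_tokenizer,
        second_text="Second sentence.",
    )
    print(segments)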