import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import tqdm
import random
from vocab import Vocab
import pickle
import copy
from sklearn.preprocessing import OneHotEncoder


class PretrainerDataset(Dataset):
    """
    Class name: PretrainerDataset
    Builds BERT-style masked-sequence examples (and, optionally, next-sequence
    pairs) from a whitespace-tokenized file where blank lines separate documents.
    """
    def __init__(self, dataset_path, vocab, seq_len=30, select_next_seq=False):
        self.dataset_path = dataset_path
        self.vocab = vocab  # Vocab object

        # Read the input dataset file: a blank line starts a new document (student).
        self.lines = []
        self.index_documents = {}
        seq_len_list = []
        with open(self.dataset_path, "r") as reader:
            i = 0
            index = 0
            self.index_documents[i] = []
            for line in tqdm.tqdm(reader.readlines()):
                if line:
                    line = line.strip()
                    if not line:
                        i += 1
                        self.index_documents[i] = []
                    else:
                        self.index_documents[i].append(index)
                        self.lines.append(line.split())
                        len_line = len(line.split())
                        seq_len_list.append(len_line)
                        index += 1
        print("Sequence Stats: ", len(seq_len_list), min(seq_len_list),
              max(seq_len_list), sum(seq_len_list) / len(seq_len_list))
        print("Unique Sequences: ", len({tuple(ll) for ll in self.lines}))
        self.index_documents = {k: v for k, v in self.index_documents.items() if v}
        self.seq_len = seq_len
        self.max_mask_per_seq = 0.15
        self.select_next_seq = select_next_seq
        print("Sequence length set at ", self.seq_len)
        print("select_next_seq: ", self.select_next_seq)
        print(len(self.index_documents))

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, item):
        token_a = self.lines[item]
        token_b = None
        is_same_student = None
        sa_masked = None
        sa_masked_label = None
        sb_masked = None
        sb_masked_label = None

        if self.select_next_seq:
            is_same_student, token_b = self.get_token_b(item)
            is_same_student = 1 if is_same_student else 0
            token_a1, token_b1 = self.truncate_to_max_seq(token_a, token_b)
            sa_masked, sa_masked_label = self.random_mask_seq(token_a1)
            sb_masked, sb_masked_label = self.random_mask_seq(token_b1)
        else:
            # Leave room for the [CLS] and [SEP] tokens.
            token_a = token_a[:self.seq_len - 2]
            sa_masked, sa_masked_label = self.random_mask_seq(token_a)

        s1 = [self.vocab.vocab['[CLS]']] + sa_masked + [self.vocab.vocab['[SEP]']]
        s1_label = [self.vocab.vocab['[PAD]']] + sa_masked_label + [self.vocab.vocab['[PAD]']]
        segment_label = [1 for _ in range(len(s1))]

        if self.select_next_seq:
            s1 = s1 + sb_masked + [self.vocab.vocab['[SEP]']]
            s1_label = s1_label + sb_masked_label + [self.vocab.vocab['[PAD]']]
            segment_label = segment_label + [2 for _ in range(len(sb_masked) + 1)]

        # Pad every field out to seq_len.
        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
        s1.extend(padding)
        s1_label.extend(padding)
        segment_label.extend(padding)

        output = {'bert_input': s1,
                  'bert_label': s1_label,
                  'segment_label': segment_label}
        if self.select_next_seq:
            output['is_same_student'] = is_same_student
        return {key: torch.tensor(value) for key, value in output.items()}

    def random_mask_seq(self, tokens):
        """
        Apply BERT-style random masking.
        Input: original token sequence
        Output: masked token sequence, label sequence for the masked positions
        """
        output_labels = []
        output_tokens = copy.deepcopy(tokens)
        for i, token in enumerate(tokens):
            prob = random.random()
            if prob < 0.15:
                # Choose 15% of token positions at random for prediction.
                prob = random.random()
                if prob < 0.8:  # [MASK] token 80% of the time
                    output_tokens[i] = self.vocab.vocab['[MASK]']
                elif prob < 0.9:  # a random token 10% of the time
                    output_tokens[i] = random.randint(1, len(self.vocab.vocab) - 1)
                else:  # the unchanged i-th token 10% of the time
                    output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
                # True label at masked positions.
                output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
            else:
                # i-th token keeps its original value; label is [PAD]
                # (unmasked positions carry no prediction target).
                output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
                output_labels.append(self.vocab.vocab['[PAD]'])
        return output_tokens, output_labels

    def get_token_b(self, item):
        # Locate the document (student) that contains sequence `item`.
        document_id = [k for k, v in self.index_documents.items() if item in v][0]
        random_document_id = document_id
        if random.random() < 0.5:
            # Half of the time, sample the second sequence from a different document.
            document_ids = [k for k in self.index_documents.keys() if k != document_id]
            random_document_id = random.choice(document_ids)
        same_student = (random_document_id == document_id)
        next_seq_list = self.index_documents.get(random_document_id)
        if same_student:
            if len(next_seq_list) != 1:
                # Avoid pairing a sequence with itself when possible.
                next_seq_list = [v for v in next_seq_list if v != item]
        next_seq = random.choice(next_seq_list)
        tokens = self.lines[next_seq]
        return same_student, tokens

    def truncate_to_max_seq(self, s1, s2):
        # Reserve three positions for [CLS] and the two [SEP] tokens, then pop
        # tokens at random from either sequence until the pair fits.
        sa = copy.deepcopy(s1)
        sb = copy.deepcopy(s2)
        total_allowed_seq = self.seq_len - 3
        while (len(sa) + len(sb)) > total_allowed_seq:
            if random.random() < 0.5:
                sa.pop()
            else:
                sb.pop()
        return sa, sb


class TokenizerDataset(Dataset):
    """
    Class name: TokenizerDataset
    Tokenizes each sequence in the dataset and pairs it with a one-hot encoded label.
    """
    def __init__(self, dataset_path, label_path, vocab, seq_len=30, train=True):
        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.encoder = OneHotEncoder(sparse_output=False)

        # Read labels (one value per non-empty line) and one-hot encode them.
        self.lines = []
        self.labels = []
        with open(self.label_path, "r") as label_file:
            for line in label_file:
                if line:
                    line = line.strip()
                    if not line:
                        continue
                    self.labels.append(float(line))
        labeler = np.unique(self.labels)
        self.encoder.fit(labeler.reshape(-1, 1))
        self.labels = self.encoder.transform(np.array(self.labels).reshape(-1, 1))
        # print(f"labels: {self.labels}")
        # Earlier class-balancing experiment (unused):
        # info_file_name = self.dataset_path.split('.')
        # info_file_name = info_file_name[0] + "_info." + info_file_name[1]
        # progress = []
        # with open(info_file_name, "r") as f:
        #     for line in f:
        #         if line:
        #             line = line.strip()
        #             if not line:
        #                 continue
        #             line = line.split(",")[0]
        #             pstat = 1 if line == "GRADUATED" else 0
        #             progress.append(pstat)
        # indices_of_grad = np.where(np.array(progress) == 1)[0]
        # indices_of_prom = np.where(np.array(progress) == 0)[0]
        # indices_of_zeros = np.where(np.array(labels) == 0)[0]
        # indices_of_ones = np.where(np.array(labels) == 1)[0]
        # number_of_items = min(len(indices_of_zeros), len(indices_of_ones))
        # # number_of_items = min(len(indices_of_grad), len(indices_of_prom))
        # print(number_of_items)
        # indices_of_zeros = indices_of_zeros[:number_of_items]
        # indices_of_ones = indices_of_ones[:number_of_items]
        # print(indices_of_zeros)
        # print(indices_of_ones)
        # indices_of_grad = indices_of_grad[:number_of_items]
        # indices_of_prom = indices_of_prom[:number_of_items]
        # print(indices_of_grad)
        # print(indices_of_prom)

        # Read the input sequences, one per non-empty line.
        with open(self.dataset_path, "r") as data_file:
            # index = 0
            for line in data_file:
                if line:
                    line = line.strip()
                    if line:
                        self.lines.append(line)
                        # if train:
                        #     if index in indices_of_zeros:
                        #     # if index in indices_of_prom:
                        #         self.lines.append(line)
                        #         self.labels.append(0)
                        #     if index in indices_of_ones:
                        #     # if index in indices_of_grad:
                        #         self.lines.append(line)
                        #         self.labels.append(1)
                        # else:
                        #     self.lines.append(line)
                        #     self.labels.append(labels[index])
                        #     # self.labels.append(progress[index])
                        # index += 1

        self.len = len(self.lines)
        self.seq_len = seq_len
        print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels))

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        # to_seq acts like a tokenizer and adds [CLS] and [SEP].
        s1 = self.vocab.to_seq(self.lines[item], self.seq_len)
        s1_label = self.labels[item]
        segment_label = [1 for _ in range(len(s1))]
        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
        s1.extend(padding)
        segment_label.extend(padding)
        output = {'bert_input': s1,
                  'progress_status': s1_label,
                  'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}


# if __name__ == "__main__":
#     # import pickle
#     # k = pickle.load(open("dataset/CL4999_1920/unique_steps_list.pkl", "rb"))
#     # print(k)
#     vocab_obj = Vocab("pretraining/vocab.txt")
#     vocab_obj.load_vocab()
#     datasetTrain = PretrainerDataset("pretraining/pretrain.txt", vocab_obj)
#     print(datasetTrain, len(datasetTrain))  # , datasetTrain.documents_index
#     print(datasetTrain[len(datasetTrain) - 1])
#     for i, d in enumerate(datasetTrain):
#         print(d.items())
#         break
#     fine_tune = TokenizerDataset("finetuning/finetune.txt", "finetuning/finetune_label.txt", vocab_obj)
#     print(fine_tune)
#     print(fine_tune[len(fine_tune) - 1])
#     print(fine_tune[random.randint(0, len(fine_tune))])
#     for i, d in enumerate(fine_tune):
#         print(d.items())
#         break
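
# A minimal DataLoader sketch, assuming the vocab/data paths from the example
# above exist; the batch size and shuffling choices below are illustrative only.
# from torch.utils.data import DataLoader
#
# vocab_obj = Vocab("pretraining/vocab.txt")
# vocab_obj.load_vocab()
# pretrain_data = PretrainerDataset("pretraining/pretrain.txt", vocab_obj, seq_len=30)
# pretrain_loader = DataLoader(pretrain_data, batch_size=32, shuffle=True)
# batch = next(iter(pretrain_loader))
# print(batch['bert_input'].shape, batch['bert_label'].shape, batch['segment_label'].shape)
#
# finetune_data = TokenizerDataset("finetuning/finetune.txt", "finetuning/finetune_label.txt", vocab_obj)
# finetune_loader = DataLoader(finetune_data, batch_size=32, shuffle=False)
# batch = next(iter(finetune_loader))
# print(batch['bert_input'].shape, batch['progress_status'].shape)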