from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
import os


def main():
    # Disable caching so preprocessing always runs fresh
    # (newer versions of datasets use datasets.disable_caching()).
    datasets.set_caching_enabled(False)

    tokenizer = AutoTokenizer.from_pretrained(r"/tokenizer/loc")
    # GPT-style tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    # Collect every review JSON file in the data directory and load it
    # through the custom loading script.
    data_loc = "path/to/review/jsons"
    data_files = [fil.path for fil in os.scandir(data_loc)]
    dataset = load_dataset('online_reviews_loading.py', data_files=data_files)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length",
                         truncation=True, max_length=512)

    def process_rating(examples):
        # Cast integer star ratings to floats so they can serve as
        # regression labels under the conventional "labels" column name.
        examples["labels"] = [float(item) for item in examples["rating"]]
        return examples

    # Tokenize, convert ratings to labels, shuffle, and carve out a 10% test split.
    dataset = (
        dataset["train"]
        .map(tokenize_function, batched=True)
        .map(process_rating, batched=True, remove_columns=["rating"])
        .shuffle(seed=42)
        .train_test_split(test_size=0.1)
    )
    return dataset


if __name__ == "__main__":
    main()
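

# ---------------------------------------------------------------------------
# The load_dataset() call above relies on a custom loading script,
# `online_reviews_loading.py`, whose contents are not shown here. The class
# below is a minimal sketch of what such a script might contain, assuming
# each data file is JSON Lines with "text" and "rating" fields; the class
# name, field names, and file layout are assumptions, not the original
# implementation. It would live in its own file with its own imports.
# ---------------------------------------------------------------------------
import json  # a standalone script would also need `import datasets`


class OnlineReviewsLoading(datasets.GeneratorBasedBuilder):
    """Hypothetical builder for the review JSON files (sketch only)."""

    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features({
                "text": datasets.Value("string"),
                "rating": datasets.Value("int32"),
            })
        )

    def _split_generators(self, dl_manager):
        # A plain list passed as data_files to load_dataset() is normalized
        # to {"train": [...]} on the builder config.
        files = dl_manager.download(self.config.data_files["train"])
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepaths": files},
            )
        ]

    def _generate_examples(self, filepaths):
        idx = 0
        for path in filepaths:
            with open(path, encoding="utf-8") as f:
                for line in f:
                    review = json.loads(line)  # assumed JSON Lines format
                    yield idx, {"text": review["text"],
                                "rating": review["rating"]}
                    idx += 1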