from datasets import load_dataset, concatenate_datasets
from tokenizers import trainers, Tokenizer, normalizers
from t5_tokenizer_model import SentencePieceUnigramTokenizer

vocab_size = 50_000
input_sentence_size = None
model_dir = "./"  # ${MODEL_DIR}

# Initialize a dataset
dataset = load_dataset(
    "json",
    data_files=[
        "/mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_validation.json",
        "/mnt/disks/flaxdisk/corpus/special_chars.json",
    ],
    split="train",
)

tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")

# Build an iterator over this dataset
def batch_iterator(input_sentence_size=None):
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i: i + batch_length]["text"]

# Train tokenizer
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Save files to disk
tokenizer.save(f"{model_dir}/tokenizer.json")
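
# Optional sanity check (a minimal sketch, not part of the original script): reload the
# saved tokenizer.json and encode a sample sentence. The Norwegian sample text below is
# an assumption chosen purely for illustration.
reloaded_tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")  # uses Tokenizer imported above
encoding = reloaded_tokenizer.encode("Dette er en test.")
print(encoding.tokens)  # sub-word pieces produced by the trained Unigram model
print(encoding.ids)     # corresponding vocabulary ids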
