from transformers import AutoTokenizer
from datasets import load_dataset

# Load tokenizer and dataset
model_name = "Visdom9/Norah"
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset("OpenAssistant/oasst1", split="train")

# Some causal-LM checkpoints ship without a pad token; fall back to EOS
# so that padding="max_length" below does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep only French examples
dataset = dataset.filter(lambda x: x["lang"] == "fr")

# Tokenize dataset
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    model_inputs["labels"] = model_inputs["input_ids"][:]  # ✅ Copy input_ids as labels
    return model_inputs

# Apply tokenization, dropping the original text columns
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Convert dataset to PyTorch tensors
tokenized_dataset.set_format("torch")

# Save tokenized dataset
tokenized_dataset.save_to_disk("tokenized_norah")
print("✅ Tokenization complete! Dataset saved to 'tokenized_norah'")
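
# A minimal verification sketch (an assumed follow-up, not part of the original
# script): reload the saved dataset with datasets.load_from_disk and inspect a
# sample to confirm the tokenized output round-trips correctly.
from datasets import load_from_disk

reloaded = load_from_disk("tokenized_norah")
print(reloaded)                        # number of rows and column names
print(reloaded[0]["input_ids"][:10])   # first ten token ids of the first example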