from datasets import load_dataset, Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Config,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast,
    DataCollatorForLanguageModeling,
)
from tokenizers import Tokenizer, models, trainers

# Load the three Sinhala corpora and merge them into a single dataset
wiki_dataset = load_dataset("LexiconShiftInnovations/SinhalaWikipediaArticles")
gov_dataset = load_dataset("Virajtharinda/SinhalaGOV")
text_dataset = load_dataset("rmihiranga/sinhala-text-fullfill-v2")

combined_texts = (
    wiki_dataset["train"]["text"]
    + gov_dataset["train"]["text"]
    + text_dataset["train"]["text"]
)
combined_dataset = Dataset.from_dict({"text": combined_texts})

# Train a BPE tokenizer from scratch on the combined corpus
tokenizer = Tokenizer(models.BPE())
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(combined_texts, trainer)
tokenizer.save("sinhala_tokenizer.json")

# Wrap the trained tokenizer so it works with the Trainer API
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="sinhala_tokenizer.json",
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Drop the raw text column so only model inputs remain in the dataset
tokenized_dataset = combined_dataset.map(
    preprocess_function, batched=True, remove_columns=["text"]
)

# Configure a small GPT-2 model (6 layers) with a 512-token context window
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=512,
    n_embd=768,
    n_layer=6,
    n_head=12,
)
model = GPT2LMHeadModel(config)

# The collator copies input_ids into labels (mlm=False -> causal LM objective);
# without it the Trainer has no labels and cannot compute a loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./sinhala_LM_V1",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    fp16=True,
    push_to_hub=True,
    hub_model_id="your-username/sinhala_LM_V1",  # Replace with your username
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,  # saved and pushed to the Hub alongside the model
)

# Train and push to the Hub
trainer.train()
trainer.push_to_hub()
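
# Optional sanity check (a minimal sketch, not part of the training script above):
# load the checkpoint written to output_dir and generate a short continuation.
# This assumes "./sinhala_LM_V1" contains both the model and the tokenizer, which
# is the case when tokenizer= is passed to Trainer as above; the Sinhala prompt
# below is just an arbitrary example phrase.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./sinhala_LM_V1",      # or "your-username/sinhala_LM_V1" once pushed
    tokenizer="./sinhala_LM_V1",
)
print(generator("ශ්‍රී ලංකාව", max_new_tokens=30)[0]["generated_text"])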