from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Load the pre-trained model and tokenizer
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize model embeddings to accommodate the new padding token
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load your dataset
dataset = load_dataset('text', data_files={'train': '/kaggle/input/rahul7star-data1/data.txt'})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

# Set up data collator and trainer
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("/kaggle/working/finetuned_model")
tokenizer.save_pretrained("/kaggle/working/finetuned_tokenizer")
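
# --- Optional sanity check (not part of the original script) ---
# A minimal sketch that reloads the saved weights and generates one reply to
# confirm the save/load round trip works. The prompt and generation settings
# (max_new_tokens, top_p, sampling) are illustrative assumptions, not values
# taken from the training code above.
ft_model = AutoModelForCausalLM.from_pretrained("/kaggle/working/finetuned_model")
ft_tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuned_tokenizer")

prompt = "Hello, how are you?"
# DialoGPT expects the end-of-sequence token to terminate each conversational turn
inputs = ft_tokenizer(prompt + ft_tokenizer.eos_token, return_tensors="pt")

output_ids = ft_model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=ft_tokenizer.pad_token_id,
    do_sample=True,
    top_p=0.95,
)

# Decode only the newly generated tokens, skipping the prompt
reply = ft_tokenizer.decode(output_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(reply)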