rahul7star committed on
Commit cac26d5 · verified · 1 Parent(s): ad35f9e

Create model_train

Files changed (1)
  1. model_train +54 -0
model_train ADDED
@@ -0,0 +1,54 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Load the pre-trained model and tokenizer
+ model_name = "microsoft/DialoGPT-medium"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Add padding token if not present
+ if tokenizer.pad_token is None:
+     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+ # Resize model embeddings to accommodate the new padding token
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+ model.resize_token_embeddings(len(tokenizer))
+
+ # Load your dataset
+ dataset = load_dataset('text', data_files={'train': '/kaggle/input/rahul7star-data1/data.txt'})
+
+ # Tokenize the dataset
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
+
+ # Set up data collator and trainer
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False,
+ )
+
+ training_args = TrainingArguments(
+     output_dir="./results",
+     overwrite_output_dir=True,
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     save_steps=10_000,
+     save_total_limit=2,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=tokenized_datasets["train"],
+ )
+
+ # Train the model
+ trainer.train()
+
+ # Save the fine-tuned model and tokenizer
+ model.save_pretrained("/kaggle/working/finetuned_model")
+ tokenizer.save_pretrained("/kaggle/working/finetuned_tokenizer")
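
Once the script has run, the saved artifacts can be loaded back for generation. The sketch below is illustrative rather than part of the commit: it assumes the /kaggle/working paths used above and the standard DialoGPT single-turn pattern of appending the EOS token to the prompt; the prompt string itself is a made-up example.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Minimal inference sketch (assumption: the training script above has
# already run and saved to these /kaggle/working paths).
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuned_tokenizer")
model = AutoModelForCausalLM.from_pretrained("/kaggle/working/finetuned_model")

# DialoGPT expects the prompt to end with the EOS token; the model then
# generates the reply as a continuation of the encoded prompt.
prompt = "Hello, how are you?"
input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt")
output_ids = model.generate(input_ids, max_length=100, pad_token_id=tokenizer.pad_token_id)

# Decode only the newly generated tokens, skipping the echoed prompt
reply = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(reply)

Because the training script adds a dedicated [PAD] token and resizes the embeddings before training, the saved tokenizer and model vocabularies stay aligned, so loading and generating needs no further fixes.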