# ITLT_Journal / finetunning.py
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    Trainer,
    TrainingArguments,
)

# Confirm that a CUDA-capable GPU is visible before starting a long training run.
print("CUDA available:", torch.cuda.is_available())
def load_dataset(file_path, tokenizer, block_size=128):
    """Read a plain-text file and chunk it into fixed-size token blocks.

    Note: TextDataset is deprecated in recent versions of transformers;
    see the sketch after load_data_collator for a replacement based on
    the `datasets` library.
    """
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm=False):
    """Build a collator for causal language modeling (mlm=False)."""
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator
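

# Because TextDataset is deprecated, here is a minimal alternative sketch using
# the `datasets` library instead. It assumes `pip install datasets` and a plain
# UTF-8 text file; the helper name `load_dataset_hf` is our own, not part of
# the original script.
def load_dataset_hf(file_path, tokenizer, block_size=128):
    from datasets import load_dataset as hf_load_dataset

    raw = hf_load_dataset("text", data_files=file_path)["train"]

    def tokenize(batch):
        return tokenizer(batch["text"])

    tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])

    def group_texts(batch):
        # Concatenate all token ids, then split them into block_size chunks,
        # dropping the ragged remainder at the end.
        ids = sum(batch["input_ids"], [])
        total = (len(ids) // block_size) * block_size
        return {"input_ids": [ids[i : i + block_size] for i in range(0, total, block_size)]}

    return tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)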
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps,
          resume_from_checkpoint):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    tokenizer.save_pretrained(output_dir)

    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Save the starting weights so output_dir is a complete model directory
    # even before the first checkpoint is written.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # write a checkpoint every save_steps optimizer steps
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    trainer.save_model()
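

# Checkpoints land in output_dir/checkpoint-<step>; with
# resume_from_checkpoint=True, trainer.train() locates the most recent
# one in output_dir automatically and continues from it.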
# --- configuration ---
# train_directory = 'H:/Finetunning/q_and_a'  # unused
train_file_path = 'H:/Finetunning/journal.txt'
model_name = 'malteos/gpt2-uk'  # base Ukrainian GPT-2 checkpoint to fine-tune
output_dir = 'H:/Finetunning/custom_full_text'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 51
save_steps = 50000
print("Починаємо навчання...")
train(
train_file_path=train_file_path,
model_name=model_name,
output_dir=output_dir,
overwrite_output_dir=overwrite_output_dir,
per_device_train_batch_size=per_device_train_batch_size,
num_train_epochs=num_train_epochs,
save_steps=save_steps,
resume_from_checkpoint=True # False для першого разу, True - з якоїсь точки остановки
)
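
# After training, reload the fine-tuned model for a quick smoke test.
# A minimal sketch, assuming the model and tokenizer were saved to output_dir
# as above; the prompt string ("Сьогодні" = "Today") is only an illustration.
from transformers import pipeline

generator = pipeline("text-generation", model=output_dir, tokenizer=output_dir)
print(generator("Сьогодні", max_new_tokens=50)[0]["generated_text"])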