"""Fine-tune a TinyLlama causal language model on a local plain-text file."""

import os

# The backend flag must be in the environment before transformers/datasets
# are imported: both libraries inspect it while they are being imported, so
# assigning it afterwards has no effect.
os.environ["USE_TF"] = "0"

from datasets import load_dataset  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Hub checkpoint used as the starting point for fine-tuning.
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding requires a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Read the local corpus with the generic "text" builder; the rows end up in
# the "train" split under a "text" column.
dataset = load_dataset(
    "text",
    data_files={"train": "skin_disease_articles_clean.txt"},
)


def tokenize_function(examples):
    """Tokenize a batch of raw text rows into fixed-length sequences.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, with every row truncated or
        padded to exactly 128 tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# Drop whitespace-only rows first so that no training example consists of
# padding alone, then tokenize in batches and discard the raw "text" column.
tokenized_datasets = dataset.filter(
    lambda example: bool(example["text"].strip())
).map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

train_dataset = tokenized_datasets["train"]

# mlm=False selects causal language modeling: the collator builds the labels
# from the input ids instead of applying masked-token corruption.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

import torch

# Single-epoch run with a checkpoint every 500 steps, keeping only the two
# most recent checkpoints on disk.
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned-skin",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    # Requesting fp16 without a CUDA device makes TrainingArguments raise, so
    # enable mixed precision only when a GPU is actually present.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Periodic checkpoints only land every `save_steps`; write the final weights
# and the tokenizer to the output directory so the result can be reloaded
# directly with from_pretrained().
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)