skin / finetune.py
Kalyangotimothy
new
217a100
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import os
os.environ["USE_TF"] = "0"
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
# Load your text file as a dataset
dataset = load_dataset("text", data_files={"train": "skin_disease_articles_clean.txt"})
# Tokenize the dataset
def tokenize_function(examples):
return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_dataset = tokenized_datasets["train"]
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=False
)
training_args = TrainingArguments(
output_dir="./tinyllama-finetuned-skin",
overwrite_output_dir=True,
num_train_epochs=1,
per_device_train_batch_size=2,
save_steps=500,
save_total_limit=2,
prediction_loss_only=True,
fp16=True # Set True if using GPU with float16 support
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
)
trainer.train()