from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# LLaMA-style tokenizers ship without a pad token; reuse EOS so the data
# collator can pad batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset. Note that TextDataset is deprecated in recent transformers
# releases; a datasets-based alternative is sketched below.
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

train_dataset = load_dataset("skin_disease_articles_clean.txt", tokenizer)
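
# Alternative sketch (assumes the `datasets` library is installed): since
# TextDataset is deprecated, newer pipelines build the same fixed-length
# blocks with datasets. Defined here for reference; not called above.
def load_dataset_hf(file_path, tokenizer, block_size=128):
    from datasets import load_dataset as hf_load_dataset

    raw = hf_load_dataset("text", data_files=file_path, split="train")

    def tokenize(batch):
        return tokenizer(batch["text"])

    def group_texts(examples):
        # Concatenate all token ids, then slice into block_size chunks,
        # dropping the ragged tail (mirrors TextDataset's behaviour).
        ids = [tok for seq in examples["input_ids"] for tok in seq]
        total = (len(ids) // block_size) * block_size
        return {
            "input_ids": [ids[i : i + block_size] for i in range(0, total, block_size)]
        }

    tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])
    return tokenized.map(
        group_texts, batched=True, remove_columns=tokenized.column_names
    )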

# Causal-LM objective: with mlm=False the collator copies input_ids to labels.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned-skin",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=False,  # Set True if using a GPU with float16 support
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
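
# Follow-up sketch (not part of the original listing): persist the fine-tuned
# weights and run a quick smoke test. The prompt below is purely illustrative.
trainer.save_model("./tinyllama-finetuned-skin")
tokenizer.save_pretrained("./tinyllama-finetuned-skin")

prompt = "Common symptoms of psoriasis include"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))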