from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# TinyLlama's Llama tokenizer ships without a pad token; reuse EOS so the
# data collator can pad batches without raising an error.
tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset: TextDataset chunks the raw text file into fixed-length
# blocks of token IDs. Note that TextDataset is deprecated in recent
# transformers releases (see the datasets-based sketch below).
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

train_dataset = load_dataset("skin_disease_articles_clean.txt", tokenizer)

# mlm=False selects causal language modeling: labels are the input IDs
# shifted by one, which is what a decoder-only model like TinyLlama expects.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned-skin",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=False,  # Set True if using a GPU with float16 support
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
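
# --- Optional: persist and smoke-test the fine-tuned model ---
# A minimal sketch of checking the result after training; the prompt string
# is an assumed example, not from the original script.
trainer.save_model("./tinyllama-finetuned-skin")
tokenizer.save_pretrained("./tinyllama-finetuned-skin")

prompt = "Common symptoms of psoriasis include"  # assumed illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))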
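
# --- Alternative data pipeline (sketch, since TextDataset is deprecated) ---
# The Hugging Face `datasets` library is the recommended replacement for
# TextDataset. The snippet below mirrors its fixed-block chunking under that
# assumption; `tokenize` and `group_texts` are helpers defined here, not
# library APIs. Pass the resulting `train_dataset` to the Trainer as above.
from itertools import chain

from datasets import load_dataset as hf_load_dataset

block_size = 128
raw = hf_load_dataset("text", data_files={"train": "skin_disease_articles_clean.txt"})

def tokenize(batch):
    # Tokenize each raw line; no truncation, blocks are formed afterwards.
    return tokenizer(batch["text"])

def group_texts(examples):
    # Concatenate all token IDs, then split into block_size chunks,
    # dropping the remainder (same behavior as TextDataset).
    concatenated = list(chain.from_iterable(examples["input_ids"]))
    total_length = (len(concatenated) // block_size) * block_size
    input_ids = [
        concatenated[i : i + block_size]
        for i in range(0, total_length, block_size)
    ]
    return {"input_ids": input_ids}

tokenized = raw["train"].map(tokenize, batched=True, remove_columns=["text"])
train_dataset = tokenized.map(
    group_texts, batched=True, remove_columns=tokenized.column_names
)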