from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch

# STEP 1: Load IMDb Dataset
dataset = load_dataset("imdb")

# STEP 2: Tokenize the Data
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

tokenized = dataset.map(preprocess, batched=True)
tokenized = tokenized.remove_columns(["text"])
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch")

# Use a smaller subset for quick training
train_dataset = tokenized["train"].shuffle(seed=42).select(range(2000))
val_dataset = tokenized["test"].shuffle(seed=42).select(range(500))

# STEP 3: Load Model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# STEP 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)

# STEP 5: Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# STEP 6: Save Locally to Repo Folder
model.save_pretrained("./")
tokenizer.save_pretrained("./")

print("✅ Model and tokenizer saved locally!")
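
# OPTIONAL STEP 7: Quick sanity check (illustrative sketch, not part of the training pipeline above).
# Reloads the weights saved in "./" and classifies one hand-written review to confirm the
# artifacts are usable. The sample text and the id-to-label mapping below are assumptions
# made for demonstration (the IMDb dataset uses 0 = negative, 1 = positive).
reloaded_model = AutoModelForSequenceClassification.from_pretrained("./")
reloaded_tokenizer = AutoTokenizer.from_pretrained("./")
reloaded_model.eval()

sample_text = "A surprisingly heartfelt film with great performances."  # hypothetical example review
inputs = reloaded_tokenizer(
    sample_text, truncation=True, padding="max_length", max_length=256, return_tensors="pt"
)
with torch.no_grad():
    logits = reloaded_model(**inputs).logits
predicted_id = int(logits.argmax(dim=-1))
id2label = {0: "negative", 1: "positive"}  # assumed label names; the saved config only has LABEL_0/LABEL_1
print(f"Sample prediction: {id2label[predicted_id]}")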