# Repository: datasetsANDmodels/benginVSmaliuos -- file: finetune_mal_ben.py
# Uploaded by: datasetsANDmodels
# Last commit: "Update finetune_mal_ben.py" (8b7df56, verified)
#https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification,BertForSequenceClassification
from datasets import load_dataset
import numpy as np
import evaluate
from huggingface_hub import HfFolder
# Tokenizer for the "roberta-large" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Split name -> CSV path handed to `load_dataset`.
# NOTE(review): both splits point at the same file, so "test" is not a
# held-out set -- confirm this is intended.
file_dict = dict(train="benmal.csv", test="benmal.csv")

# Load the CSV data with the columns named "text" and "label".
# `skiprows=1` skips the first row of each file (presumably the header
# line -- confirm against benmal.csv).
dataset = load_dataset(
    "csv",
    data_files=file_dict,
    column_names=["text", "label"],
    skiprows=1,
    delimiter=",",
)

# Shuffle the loaded data; no seed is passed here.
raw_dataset = dataset.shuffle()
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch of examples.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings of
            the batch, as supplied by ``map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to the tokenizer's maximum length.
    """
    # Fixed-length padding is kept on purpose: the Trainer in this script is
    # built without a padding data collator, so all examples must already
    # share one length.
    # `return_tensors` is intentionally omitted: `map` stores the result as
    # dataset columns, so plain Python lists are sufficient and no tensor
    # round-trip is needed.
    return tokenizer(batch["text"], padding="max_length", truncation=True)
# Apply `tokenize` in batches and drop the raw "text" column afterwards.
tokenized_dataset = raw_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# Checkpoint to fine-tune, loaded with a 2-label classification head.
model_id = "roberta-large"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# F1 metric object used by `compute_metrics`.
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for a set of predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` along axis 1 to obtain class ids.

    Returns:
        The result of ``metric.compute`` with ``average="weighted"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )
from transformers import DataCollatorWithPadding
# Padding collator built from the tokenizer. It is not currently handed to
# the Trainer (that argument is commented out in the Trainer call).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Used both as the local output directory and as the Hub model id.
repository_id = "datasetsANDmodels/benginVSmaliuos"

training_args = TrainingArguments(
    output_dir=repository_id,
    # Optimisation settings.
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch, keeping at most two checkpoints,
    # and reload the best checkpoint when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Hub upload configuration.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
    # Options left disabled by the author:
    # torch_compile=True,
    # metric_for_best_model="f1",
    # report_to="tensorboard",
)
# Build the Trainer. Evaluation runs on the "test" split instead of re-using
# the training split, so the per-epoch evaluation (and the best-checkpoint
# selection enabled by `load_best_model_at_end`) is not computed on the very
# examples the model was just trained on.
# NOTE(review): `file_dict` currently maps "test" to the same CSV as "train";
# point it at a real held-out file for this evaluation to be meaningful.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # Options left disabled by the author:
    # compute_metrics=compute_metrics,
    # tokenizer=tokenizer,
    # data_collator=data_collator,
)
# Final stage of the script: run training, then publish the artefacts.
import torch._dynamo
# Ask TorchDynamo to suppress its errors.
# NOTE(review): `torch_compile` is commented out in TrainingArguments, so
# this setting is presumably a leftover -- confirm before removing.
torch._dynamo.config.suppress_errors = True
# Run the fine-tuning loop configured above.
trainer.train()
# Save the tokenizer files under `repository_id`, the same path that is
# used as the Trainer's output directory.
tokenizer.save_pretrained(repository_id)
# Generate the model card, then push to the Hub.
trainer.create_model_card()
trainer.push_to_hub()