File size: 2,347 Bytes
61fe8b0 8b7df56 61fe8b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
#https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification,BertForSequenceClassification
from datasets import load_dataset
import numpy as np
import evaluate
from huggingface_hub import HfFolder
# Tokenizer for the "roberta-large" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# NOTE(review): both splits are read from the same CSV file, so "test" is not
# held-out data -- confirm this is intended.
file_dict = {"train": "benmal.csv", "test": "benmal.csv"}

# Load the CSV with explicit column names. skiprows=1 skips the first row of
# the file (presumably a header line -- TODO confirm against benmal.csv).
dataset = load_dataset(
    "csv",
    data_files=file_dict,
    delimiter=",",
    column_names=["text", "label"],
    skiprows=1,
)

# Shuffle the loaded data; no seed is passed to shuffle().
raw_dataset = dataset.shuffle()
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch for use with ``Dataset.map``.

    Sequences are truncated and padded with ``padding="max_length"``, so every
    row comes out with the same length.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        The tokenizer output for the whole batch, as plain Python lists.
    """
    # No return_tensors here: Dataset.map writes its results to Arrow-backed
    # columns, so tensors built at this point would just be converted back.
    return tokenizer(batch["text"], padding="max_length", truncation=True)
# Run the tokenizer over the data in batches and drop the raw "text" column
# once it has been encoded.
tokenized_dataset = raw_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# Checkpoint to fine-tune, loaded with a two-label classification head.
model_id = "roberta-large"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# F1 metric object used by compute_metrics.
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predictions are reduced
            with an argmax over axis 1 (assumes per-class scores shaped
            ``(n_samples, n_classes)`` -- TODO confirm).

    Returns:
        The result of ``metric.compute`` with ``average="weighted"``.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class index for every sample.
    predicted_labels = np.argmax(scores, axis=1)
    return metric.compute(
        predictions=predicted_labels,
        references=references,
        average="weighted",
    )
# Padding collator built on the script's tokenizer.
# NOTE(review): the Trainer's data_collator argument is commented out in this
# script, so this object is currently unused -- confirm whether it should be
# wired in or removed.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Hub repository name; it doubles as the local output directory.
repository_id = "datasetsANDmodels/benginVSmaliuos"

training_args = TrainingArguments(
    # --- Output location and Hub upload ---
    output_dir=repository_id,
    push_to_hub=True,
    hub_model_id=repository_id,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
    # --- Optimisation ---
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # torch_compile=True,
    # --- Evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="f1",
    # report_to="tensorboard",
)
# Trainer wiring the model, the training arguments and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # Evaluate on the declared "test" split instead of re-using "train".
    # NOTE(review): file_dict currently points both splits at the same CSV,
    # so the scores stay the same until a real held-out file is supplied.
    eval_dataset=tokenized_dataset["test"],
    # compute_metrics=compute_metrics,
    # tokenizer=tokenizer,
    # data_collator=data_collator,
)
# Ask TorchDynamo to suppress its own errors rather than raise them.
# NOTE(review): torch_compile is commented out in the training arguments, so
# this setting presumably has no effect right now -- confirm.
import torch._dynamo
torch._dynamo.config.suppress_errors = True
# Run the fine-tuning loop.
trainer.train()
# Save the tokenizer files under the repository_id directory (the same path is
# used as the Trainer's output_dir).
tokenizer.save_pretrained(repository_id)
# Generate the model card, then upload to the Hub.
trainer.create_model_card()
trainer.push_to_hub()
|