from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
import numpy as np
import evaluate
from huggingface_hub import HfFolder
|
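# Tokenizer for the "roberta-large" checkpoint that is fine-tuned below.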
tokenizer = AutoTokenizer.from_pretrained("roberta-large") |
|
|
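# Train and test both point at the same CSV, so the "test" split is not a
# held-out set; see the note on eval_dataset below.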
file_dict = {
    "train": "benmal.csv",
    "test": "benmal.csv",
}
|
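# Load the CSV with explicit column names; skiprows=1 drops the header row.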
dataset = load_dataset(
    "csv",
    data_files=file_dict,
    delimiter=",",
    column_names=["text", "label"],
    skiprows=1,
)
|
|
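# Shuffle so the original row order does not leak into training batches.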
raw_dataset = dataset.shuffle()
|
|
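# Tokenize in batches. Fixed-length padding keeps every example the same size;
# map() stores the encodings as lists and the Trainer tensorizes them per batch.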
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
|
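# Binary classification head on top of roberta-large; the head is newly
# initialized and learned during fine-tuning.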
model_id = "roberta-large"
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
|
|
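# Weighted F1 averages per-class F1 by support, which stays meaningful if the
# benign/malicious classes are imbalanced.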
metric = evaluate.load("f1") |
|
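# Convert logits to predicted class ids before scoring.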
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")
|
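# Pads each batch to its longest member; with max_length padding above this is
# effectively a no-op, but it keeps the pipeline correct if padding changes.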
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
|
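# Hub repository that receives checkpoints, the tokenizer, and the model card.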
repository_id = "datasetsANDmodels/benginVSmaliuos" |
|
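# Evaluate and checkpoint once per epoch; load_best_model_at_end restores the
# checkpoint with the lowest eval loss (the default metric_for_best_model).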
training_args = TrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)
|
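# The "test" split holds the same data as "train" (see file_dict above), so
# the reported eval metrics reflect training fit rather than generalization.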
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
|
|
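# Fall back to eager execution if torch.compile graph capture fails.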
import torch._dynamo
torch._dynamo.config.suppress_errors = True
|
|
trainer.train() |
|
|
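# Save the tokenizer alongside the model and publish everything to the Hub.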
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
trainer.push_to_hub()
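
# Quick smoke test (a sketch: assumes the push above succeeded and the repo is
# readable with the current token). Reload the published model and classify
# one hypothetical example.
from transformers import pipeline

classifier = pipeline("text-classification", model=repository_id)
print(classifier("example text to classify"))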