Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer | |
from sklearn.model_selection import train_test_split | |
from datasets import Dataset | |
import torch | |
from tqdm import tqdm | |
def load_data(file_path):
    """Read a CSV and keep only the columns used for training.

    Args:
        file_path: Path (or file-like object) handed straight to
            ``pandas.read_csv``.

    Returns:
        A DataFrame restricted to the ``title``, ``summary`` and ``tag``
        columns, in that order.

    Raises:
        KeyError: If any of the three columns is absent from the CSV.
    """
    wanted_columns = ['title', 'summary', 'tag']
    frame = pd.read_csv(file_path)
    return frame[wanted_columns]
def prepare_data(df):
    """Build the model input text and integer labels from the raw columns.

    The ``title`` and ``summary`` columns are joined with a single space into
    a new ``text`` column, and every distinct ``tag`` is assigned an integer
    id (in order of first appearance) stored in a new ``label`` column.

    Args:
        df: DataFrame with ``title``, ``summary`` and ``tag`` columns.

    Returns:
        A tuple ``(df, tag2id, id2tag)`` where ``df`` is a copy of the input
        with the ``text`` and ``label`` columns added, ``tag2id`` maps each
        tag to its integer id and ``id2tag`` is the inverse mapping.
    """
    # Work on a copy so the caller's frame is not modified behind its back
    # (and so pandas does not warn about writing to a slice of another frame).
    df = df.copy()

    # Fill missing values on BOTH sides: a NaN title would otherwise turn the
    # whole concatenation into NaN, which is not valid tokenizer input.
    df['text'] = df['title'].fillna('') + ' ' + df['summary'].fillna('')

    unique_tags = df['tag'].unique()
    tag2id = {tag: i for i, tag in enumerate(unique_tags)}
    id2tag = {i: tag for tag, i in tag2id.items()}
    df['label'] = df['tag'].map(tag2id)
    return df, tag2id, id2tag
def train_model(df, model_name="distilbert-base-cased", output_dir="./models"):
    """Fine-tune a sequence-classification model to predict ``tag`` from text.

    The frame is passed through ``prepare_data``, split 80/20 into train and
    evaluation sets (fixed seed), tokenized, and used to train
    ``AutoModelForSequenceClassification`` for one epoch with ``Trainer``.
    The resulting model and tokenizer are written to ``output_dir``.

    Args:
        df: DataFrame with ``title``, ``summary`` and ``tag`` columns.
        model_name: Name or path of the pretrained checkpoint to start from.
        output_dir: Directory for checkpoints and the final saved
            model/tokenizer.

    Returns:
        A tuple ``(model, tokenizer, id2tag)`` where ``id2tag`` maps the
        integer label ids back to the original tag values.
    """
    # Local import: only needed to probe the installed transformers API below.
    import inspect

    df, tag2id, id2tag = prepare_data(df)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        # Every example is padded/truncated to a fixed length of 512 tokens.
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

    # preserve_index=False: the shuffled split has a non-trivial index that
    # would otherwise be carried into the dataset as an extra
    # `__index_level_0__` column.
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(tag2id),
        id2label=id2tag,
        label2id=tag2id,
    )

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy` and later dropped the old keyword. Pick whichever name
    # the installed version accepts so the script runs on both.
    accepted_params = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted_params:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        **{eval_strategy_key: "steps"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    trainer.train()

    # Persist both artifacts side by side so the directory can be reloaded
    # with `from_pretrained(output_dir)`.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model, tokenizer, id2tag
if __name__ == "__main__":
    # Script entry point: load the arXiv CSV (path is relative to the current
    # working directory), fine-tune with the default settings, then report.
    dataset_path = "../data/arxiv_data.csv"
    model, tokenizer, id2tag = train_model(load_data(dataset_path))
    print("успех неизбежен")