import os
import math

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

os.environ["TOKENIZERS_PARALLELISM"] = "false"
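
# Optional multi-GPU setup: distributed launchers (e.g. torchrun) set
# LOCAL_RANK; otherwise fall back to a single GPU or the CPU.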
if "LOCAL_RANK" in os.environ:
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.distributed.init_process_group(backend="nccl")
    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)
else:
    local_rank = -1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
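
# Load the custom tokenizer from the attached Kaggle dataset and sanity-check
# it on a short Kazakh sentence before training.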
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/kaz-eng-rus/pytorch/default/1")

test_text = "Қазақ тілі өте әдемі."
tokens = tokenizer.tokenize(test_text)
ids = tokenizer.encode(test_text)
print(f"Tokens: {tokens}")
print(f"IDs: {ids}")
dataset = load_dataset("json", data_files="/kaggle/input/kaz-rus-eng-wiki/train_pretrain.json")
print("First example from the dataset:", dataset["train"][0])
model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased")
model.to(device)
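
# Tokenize the masked and the original sentences and build MLM labels: only
# [MASK] positions keep the original token id, everything else is -100 so it is
# ignored by the loss. This assumes both sentences tokenize to aligned,
# equal-length sequences (each [MASK] replaces exactly one original token).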
def preprocess_dataset(examples):
    inputs = tokenizer(
        examples["masked_sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    originals = tokenizer(
        examples["original_sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )["input_ids"]

    mask_token_id = tokenizer.mask_token_id

    labels = [
        [orig_id if token_id == mask_token_id else -100
         for token_id, orig_id in zip(input_ids, original_ids)]
        for input_ids, original_ids in zip(inputs["input_ids"], originals)
    ]
    inputs["labels"] = labels
    return inputs
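
# Apply the preprocessing in batches and drop the raw text columns.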
tokenized_datasets = dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=dataset["train"].column_names,
    batch_size=1000,
)
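
# Training configuration. Each example is already padded to max_length and
# carries its labels, so the Trainer's default data collator is enough here.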
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=20,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    fp16=True,
    dataloader_num_workers=4,
    report_to="none",
)
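
# The Trainer drives the training loop, checkpointing, and (when LOCAL_RANK is
# set) distributed data parallelism.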
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

trainer.train()
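
# Persist the weights together with the tokenizer so the model can be reloaded
# later with from_pretrained("./KazBERT").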
output_dir = "./KazBERT"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")
valid_dataset = load_dataset("text", data_files="/kaggle/input/kaz-rus-eng-wiki/valid.txt", split="train[:1%]")


def compute_perplexity(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    return math.exp(loss.item())


model.eval()  # disable dropout for evaluation
ppl_scores = [compute_perplexity(model, tokenizer, sample["text"]) for sample in valid_dataset]
avg_ppl = sum(ppl_scores) / len(ppl_scores)
print(f"Model perplexity: {avg_ppl:.2f}")