Fill-Mask
Transformers
Safetensors
PyTorch
Kazakh
Russian
English
bert
KazBERT / data-pipline.py
Eraly-ml's picture
Update data-pipline.py
3229c59 verified
raw
history blame
1.19 kB
import re
import unicodedata
import random
# Path of the raw text file the pipeline reads, one candidate example per line.
# NOTE(review): this is an absolute Kaggle input path; adjust it when running elsewhere.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
# Output file that receives the training portion of the shuffled lines.
train_file = "train.txt"
# Output file that receives the remaining (dev) portion of the shuffled lines.
dev_file = "dev.txt"
def normalize_text(text):
    """Return *text* in Unicode NFC form, converted to lowercase.

    NFC composition is applied first and ``str.lower`` is called on its
    result, so the lowercasing sees precomposed characters where they exist.
    """
    return unicodedata.normalize("NFC", text).lower()
def clean_text(text):
    """Strip markup from *text* and collapse its whitespace.

    Three substitutions run in a fixed order, each replacing a match with a
    single space:

    1. ``<...>`` spans (anything between ``<`` and the next ``>``),
    2. ``[[...]]`` spans, matched non-greedily so adjacent spans are removed
       separately rather than as one large span,
    3. runs of whitespace.

    Leading and trailing whitespace is then removed from the result.
    """
    # Order matters: whitespace is collapsed last so the spaces introduced by
    # the two markup substitutions are merged as well.
    for pattern in (r"<[^>]+>", r"\[\[.*?\]\]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
# Tunables for the filtering and splitting steps below.
MIN_LINE_LENGTH = 10  # lines shorter than this many characters are dropped
TRAIN_FRACTION = 0.8  # share of the shuffled lines written to train_file
SPLIT_SEED = 42  # fixed seed so repeated runs produce the same split

# Read the input line by line, keeping only lines that are long enough both
# before and after cleaning.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Cheap early exit: skip obviously short lines without cleaning them.
        if len(line) < MIN_LINE_LENGTH:
            continue
        line = clean_text(normalize_text(line))
        # Re-check after cleaning: a line made up mostly of markup or
        # whitespace passes the raw check above but can shrink to a few
        # characters (or to nothing) once cleaned.
        if len(line) < MIN_LINE_LENGTH:
            continue
        cleaned_lines.append(line)

# Shuffle with a dedicated, seeded generator so the train/dev split is
# reproducible and the module-level random state is left untouched.
random.Random(SPLIT_SEED).shuffle(cleaned_lines)

split_index = int(TRAIN_FRACTION * len(cleaned_lines))
train_lines = cleaned_lines[:split_index]
dev_lines = cleaned_lines[split_index:]

# Write each partition as one cleaned line per output line.
for path, lines in ((train_file, train_lines), (dev_file, dev_lines)):
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)

print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")