"""Clean a line-oriented text corpus and split it into train/dev files.

Each input line is stripped, NFC-normalized, lower-cased, stripped of
angle-bracket tags and double-square-bracket spans, and whitespace-collapsed.
Lines that are too short are dropped.  The surviving lines are shuffled
with a fixed seed and written to ``train_file`` and ``dev_file``.
"""

import random
import re
import unicodedata
from typing import Iterable, List, Optional, Tuple

input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
train_file = "train.txt"
dev_file = "dev.txt"

# Lines shorter than this many characters are discarded.  The threshold is
# checked on the raw line and again after cleaning, because removing markup
# can shrink a long-looking line down to a tiny fragment.
MIN_LINE_LENGTH = 10

# Fraction of the cleaned lines that goes into the training split.
TRAIN_FRACTION = 0.8

# Fixed seed so that repeated runs produce the same train/dev split.
SHUFFLE_SEED = 42

# Compiled once at import time instead of on every processed line.
_TAG_RE = re.compile(r"<[^>]+>")
_DOUBLE_BRACKET_RE = re.compile(r"\[\[.*?\]\]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Return *text* in Unicode NFC form, lower-cased."""
    return unicodedata.normalize("NFC", text).lower()


def clean_text(text: str) -> str:
    """Remove markup from *text* and collapse its whitespace.

    Every ``<...>`` tag and every ``[[...]]`` span (including the text
    between the brackets) is replaced by a single space.  Runs of
    whitespace are then collapsed to one space and the result is stripped.
    """
    text = _TAG_RE.sub(" ", text)
    text = _DOUBLE_BRACKET_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()


def preprocess_line(line: str) -> Optional[str]:
    """Normalize and clean one raw corpus line.

    Returns the cleaned line, or ``None`` when the line should be dropped
    because it is shorter than ``MIN_LINE_LENGTH`` either before or after
    cleaning.
    """
    line = line.strip()
    if len(line) < MIN_LINE_LENGTH:
        return None
    line = clean_text(normalize_text(line))
    # Re-check after cleaning: a line that was mostly markup may now be
    # empty or only a few characters long.
    if len(line) < MIN_LINE_LENGTH:
        return None
    return line


def load_clean_lines(path: str) -> List[str]:
    """Read the UTF-8 file at *path* and return its usable, cleaned lines."""
    with open(path, "r", encoding="utf-8") as f:
        return [
            cleaned
            for cleaned in map(preprocess_line, f)
            if cleaned is not None
        ]


def split_train_dev(
    lines: Iterable[str],
    train_fraction: float = TRAIN_FRACTION,
    seed: Optional[int] = SHUFFLE_SEED,
) -> Tuple[List[str], List[str]]:
    """Shuffle *lines* and split them into ``(train, dev)`` lists.

    The input is not modified.  A dedicated ``random.Random(seed)`` is used,
    so the split is reproducible and the global random state is untouched.
    Pass ``seed=None`` to get a different split on every call.
    """
    shuffled = list(lines)
    random.Random(seed).shuffle(shuffled)
    split_index = int(train_fraction * len(shuffled))
    return shuffled[:split_index], shuffled[split_index:]


def write_lines(path: str, lines: Iterable[str]) -> None:
    """Write *lines* to *path* as UTF-8, one per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


# Guarded so that importing this module does not touch the filesystem.
# The names below stay module-level globals when run as a script/notebook.
if __name__ == "__main__":
    cleaned_lines = load_clean_lines(input_file)
    train_lines, dev_lines = split_train_dev(cleaned_lines)

    write_lines(train_file, train_lines)
    write_lines(dev_file, dev_lines)

    print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")