|
import re |
|
import unicodedata |
|
import random |
|
|
|
|
|
# Combined corpus that the script reads, one text line per record.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"

# Destination files for the two partitions; both are written relative to
# the current working directory.
train_file = "train.txt"
dev_file = "dev.txt"
|
|
|
|
|
def normalize_text(text):
    """Return *text* in Unicode NFC form, converted to lowercase.

    NFC composition runs first, so a base letter followed by a combining
    mark is merged into its precomposed code point before ``str.lower``
    is applied.
    """
    return unicodedata.normalize("NFC", text).lower()
|
|
|
def clean_text(text):
    """Remove markup remnants from *text* and tidy its whitespace.

    Three substitutions run in order, each replacing its match with a
    single space:

    1. angle-bracket tags such as ``<b>`` or ``</ref>``;
    2. double-square-bracket spans such as ``[[target|label]]`` (the whole
       span is dropped, including any text inside it);
    3. any run of whitespace.

    The result is then stripped of leading and trailing whitespace, so a
    string consisting only of markup comes back empty.
    """
    patterns = (
        r"<[^>]+>",      # tag-like spans
        r"\[\[.*?\]\]",  # shortest [[...]] spans
        r"\s+",          # whitespace runs (must be last to absorb the gaps)
    )
    for pattern in patterns:
        text = re.sub(pattern, " ", text)
    return text.strip()
|
|
|
|
|
# ---------------------------------------------------------------------------
# Corpus preparation: read the raw corpus, normalize and clean every line,
# shuffle, then write an 80/20 train/dev split.
# ---------------------------------------------------------------------------

# Lines shorter than this many characters are discarded.
MIN_LINE_LENGTH = 10

# Fraction of the shuffled lines that goes to the training file; the
# remainder goes to the dev file.
TRAIN_FRACTION = 0.8

# Fixed seed so that re-running the script on the same input yields the same
# train/dev partition (otherwise results are not comparable across runs).
SHUFFLE_SEED = 42

cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()

        # Cheap early skip: avoid normalizing/cleaning lines that are
        # already too short in their raw form.
        if len(line) < MIN_LINE_LENGTH:
            continue

        line = clean_text(normalize_text(line))

        # Re-apply the length threshold after cleaning: a raw line that is
        # mostly markup can pass the check above yet shrink to a tiny
        # fragment (or to nothing) once the markup is removed.
        if len(line) >= MIN_LINE_LENGTH:
            cleaned_lines.append(line)

# A dedicated Random instance keeps the shuffle reproducible without
# touching the global random state.
random.Random(SHUFFLE_SEED).shuffle(cleaned_lines)

split_index = int(TRAIN_FRACTION * len(cleaned_lines))
train_lines = cleaned_lines[:split_index]
dev_lines = cleaned_lines[split_index:]

with open(train_file, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in train_lines)

with open(dev_file, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in dev_lines)

print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")
|
|