Fill-Mask
Transformers
Safetensors
PyTorch
Kazakh
Russian
English
bert
Eraly-ml committed on
Commit
3229c59
·
verified ·
1 Parent(s): 6d56cee

Update data-pipline.py

Browse files
Files changed (1) hide show
  1. data-pipline.py +13 -13
data-pipline.py CHANGED
@@ -2,45 +2,45 @@ import re
2
  import unicodedata
3
  import random
4
 
5
- # Пути к файлам
6
  input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
7
  train_file = "train.txt"
8
  dev_file = "dev.txt"
9
 
10
- # Функции для очистки и нормализации текста
11
  def normalize_text(text):
12
- text = unicodedata.normalize("NFC", text) # Приведение к нормальной форме Unicode
13
- text = text.lower() # Приведение к нижнему регистру
14
  return text
15
 
16
  def clean_text(text):
17
- text = re.sub(r"<[^>]+>", " ", text) # Удаление HTML-тегов
18
- text = re.sub(r"\[\[.*?\]\]", " ", text) # Удаление вики-разметки
19
- text = re.sub(r"\s+", " ", text) # Замена нескольких пробелов на один
20
  text = text.strip()
21
  return text
22
 
23
- # Читаем и очищаем текст
24
  cleaned_lines = []
25
  with open(input_file, "r", encoding="utf-8") as f:
26
  for line in f:
27
  line = line.strip()
28
- if len(line) < 10: # Пропускаем короткие строки
29
  continue
30
  line = normalize_text(line)
31
  line = clean_text(line)
32
  if line:
33
  cleaned_lines.append(line)
34
 
35
- # Перемешиваем данные
36
  random.shuffle(cleaned_lines)
37
 
38
- # Разделяем на train/dev
39
  split_index = int(0.8 * len(cleaned_lines))
40
  train_lines = cleaned_lines[:split_index]
41
  dev_lines = cleaned_lines[split_index:]
42
 
43
- # Сохраняем файлы
44
  with open(train_file, "w", encoding="utf-8") as f:
45
  for line in train_lines:
46
  f.write(line + "\n")
@@ -49,4 +49,4 @@ with open(dev_file, "w", encoding="utf-8") as f:
49
  for line in dev_lines:
50
  f.write(line + "\n")
51
 
52
- print(f"Датасет готов! Train: {len(train_lines)}, Dev: {len(dev_lines)}")
 
2
  import unicodedata
3
  import random
4
 
5
+
6
# File locations for the pipeline: the combined Kazakh/Russian/English
# wiki corpus goes in; shuffled train/dev splits are written out.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
train_file = "train.txt"
dev_file = "dev.txt"
9
 
10
+
11
def normalize_text(text):
    """Return *text* in Unicode NFC form, lower-cased."""
    return unicodedata.normalize("NFC", text).lower()
15
 
16
def clean_text(text):
    """Strip HTML tags and wiki link markup, then collapse whitespace."""
    # Each pattern is replaced by a single space, in order; the final
    # pattern collapses any resulting runs of whitespace before trimming.
    for pattern in (r"<[^>]+>", r"\[\[.*?\]\]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
22
 
23
+
24
# Read the raw corpus, skipping very short lines and cleaning the rest.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as src:
    for raw_line in src:
        stripped = raw_line.strip()
        # Lines under 10 characters are filtered out before normalization.
        if len(stripped) < 10:
            continue
        cleaned = clean_text(normalize_text(stripped))
        if cleaned:
            cleaned_lines.append(cleaned)
34
 
35
+
36
# Shuffle so the 80/20 train/dev split is not biased by source order.
random.shuffle(cleaned_lines)

cutoff = int(0.8 * len(cleaned_lines))
train_lines, dev_lines = cleaned_lines[:cutoff], cleaned_lines[cutoff:]
42
 
43
+
44
# Persist the splits, one example per line.
with open(train_file, "w", encoding="utf-8") as out:
    out.writelines(line + "\n" for line in train_lines)

# NOTE(review): the dev-file `with open` header is visible only in the diff
# hunk context; reconstructed as open(dev_file, "w", encoding="utf-8").
with open(dev_file, "w", encoding="utf-8") as out:
    out.writelines(line + "\n" for line in dev_lines)

print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")