Fill-Mask
Transformers
Safetensors
PyTorch
Kazakh
Russian
English
bert
Eraly-ml committed on
Commit
3229c59
·
verified ·
1 Parent(s): 6d56cee

Update data-pipline.py

Browse files
Files changed (1) hide show
  1. data-pipline.py +13 -13
data-pipline.py CHANGED
@@ -2,45 +2,45 @@ import re
2
  import unicodedata
3
  import random
4
 
5
- # Пути к файлам
6
  input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
7
  train_file = "train.txt"
8
  dev_file = "dev.txt"
9
 
10
- # Функции для очистки и нормализации текста
11
  def normalize_text(text):
12
- text = unicodedata.normalize("NFC", text) # Приведение к нормальной форме Unicode
13
- text = text.lower() # Приведение к нижнему регистру
14
  return text
15
 
16
  def clean_text(text):
17
- text = re.sub(r"<[^>]+>", " ", text) # Удаление HTML-тегов
18
- text = re.sub(r"\[\[.*?\]\]", " ", text) # Удаление вики-разметки
19
- text = re.sub(r"\s+", " ", text) # Замена нескольких пробелов на один
20
  text = text.strip()
21
  return text
22
 
23
- # Читаем и очищаем текст
24
  cleaned_lines = []
25
  with open(input_file, "r", encoding="utf-8") as f:
26
  for line in f:
27
  line = line.strip()
28
- if len(line) < 10: # Пропускаем короткие строки
29
  continue
30
  line = normalize_text(line)
31
  line = clean_text(line)
32
  if line:
33
  cleaned_lines.append(line)
34
 
35
- # Перемешиваем данные
36
  random.shuffle(cleaned_lines)
37
 
38
- # Разделяем на train/dev
39
  split_index = int(0.8 * len(cleaned_lines))
40
  train_lines = cleaned_lines[:split_index]
41
  dev_lines = cleaned_lines[split_index:]
42
 
43
- # Сохраняем файлы
44
  with open(train_file, "w", encoding="utf-8") as f:
45
  for line in train_lines:
46
  f.write(line + "\n")
@@ -49,4 +49,4 @@ with open(dev_file, "w", encoding="utf-8") as f:
49
  for line in dev_lines:
50
  f.write(line + "\n")
51
 
52
- print(f"Датасет готов! Train: {len(train_lines)}, Dev: {len(dev_lines)}")
 
2
  import unicodedata
3
  import random
4
 
5
+
6
# File locations for the pipeline: the combined Kazakh/Russian/English
# wiki corpus goes in; shuffled train/dev splits are written out.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
train_file = "train.txt"
dev_file = "dev.txt"
9
 
10
+
11
def normalize_text(text):
    """Return *text* in Unicode NFC form, lower-cased."""
    return unicodedata.normalize("NFC", text).lower()
15
 
16
def clean_text(text):
    """Strip HTML tags and wiki link markup, then collapse whitespace."""
    # Each pattern is replaced by a single space, in order; the final
    # pattern collapses any resulting runs of whitespace before trimming.
    for pattern in (r"<[^>]+>", r"\[\[.*?\]\]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
22
 
23
+
24
# Read the raw corpus, skipping very short lines and cleaning the rest.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as src:
    for raw_line in src:
        stripped = raw_line.strip()
        # Lines under 10 characters are filtered out before normalization.
        if len(stripped) < 10:
            continue
        cleaned = clean_text(normalize_text(stripped))
        if cleaned:
            cleaned_lines.append(cleaned)
34
 
35
+
36
# Shuffle so the 80/20 train/dev split is not biased by source order.
random.shuffle(cleaned_lines)

cutoff = int(0.8 * len(cleaned_lines))
train_lines, dev_lines = cleaned_lines[:cutoff], cleaned_lines[cutoff:]
42
 
43
+
44
# Persist the splits, one example per line.
with open(train_file, "w", encoding="utf-8") as out:
    out.writelines(line + "\n" for line in train_lines)

# NOTE(review): the dev-file `with open` header is visible only in the diff
# hunk context; reconstructed as open(dev_file, "w", encoding="utf-8").
with open(dev_file, "w", encoding="utf-8") as out:
    out.writelines(line + "\n" for line in dev_lines)

print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")