Fill-Mask · Transformers · Safetensors · PyTorch · Kazakh · Russian · English · bert
Eraly-ml committed (verified) · Commit 6d56cee · 1 Parent(s): 2171bc2

Update data-pipline.py

Files changed (1)
  1. data-pipline.py +50 -186
data-pipline.py CHANGED
@@ -1,188 +1,52 @@
- # Import required libraries
- import pandas as pd
- from sklearn.model_selection import train_test_split
- import os
- import json
  import random
- import nltk
- from tokenizers import Tokenizer, models, pre_tokenizers, trainers

- # Download NLTK's punkt tokenizer if not already downloaded
- nltk.download('punkt')
-
- # ------------------------------------------------------------------------------
- # SECTION 1: Define file paths for datasets
- # ------------------------------------------------------------------------------
-
- # File paths
- kazakh_path = '/kaggle/input/eng-kaz/kk_wiki_articles.txt'
- english_path = '/kaggle/input/eng-kaz/test-00000-of-00001.parquet'
- russian_json_path = "hf://datasets/Den4ikAI/russian_cleared_wikipedia/wiki_dataset.json"
-
- # ------------------------------------------------------------------------------
- # SECTION 2: Load and preprocess the Kazakh dataset
- # ------------------------------------------------------------------------------
-
- # Load Kazakh dataset (each line is an article)
- with open(kazakh_path, "r", encoding="utf-8") as f:
-     kazakh_texts = f.readlines()
-
- # Strip extra spaces and remove empty lines
- kazakh_texts = [line.strip() for line in kazakh_texts if line.strip()]
- print(f"Number of Kazakh articles: {len(kazakh_texts)}")
-
- # ------------------------------------------------------------------------------
- # SECTION 3: Load and preprocess the English dataset (Parquet format)
- # ------------------------------------------------------------------------------
-
- # Load the English dataset from a Parquet file
- english_df = pd.read_parquet(english_path)
- print("English dataset columns:", english_df.columns.tolist())
-
- # Assume the text is stored in the column 'text'
- if 'text' in english_df.columns:
-     english_texts = english_df['text'].dropna().tolist()
- else:
-     # If the column name is different, use the first column
-     english_texts = english_df.iloc[:, 0].dropna().tolist()
- print(f"Number of English articles: {len(english_texts)}")
-
- # ------------------------------------------------------------------------------
- # SECTION 4: Load and preprocess the Russian dataset (JSON lines)
- # ------------------------------------------------------------------------------
-
- # Load Russian dataset (JSON, with lines=True)
- russian_df = pd.read_json(russian_json_path, lines=True)
- print("Russian dataset columns:", russian_df.columns.tolist())
-
- # Assume the text is stored in the 'text' column
- if 'text' in russian_df.columns:
-     russian_texts = russian_df['text'].dropna().tolist()
- else:
-     russian_texts = russian_df.iloc[:, 0].dropna().tolist()
- print(f"Number of Russian articles: {len(russian_texts)}")
-
- # ------------------------------------------------------------------------------
- # SECTION 5: Combine all articles and save to a combined file
- # ------------------------------------------------------------------------------
-
- # Combine all texts from the three datasets into one list
- all_texts = kazakh_texts + english_texts + russian_texts
- print(f"Total number of articles: {len(all_texts)}")
-
- # Save the combined articles to a file "combined.txt"
- with open("combined.txt", "w", encoding="utf-8") as f:
-     for article in all_texts:
-         f.write(article + "\n")
- print("Combined dataset saved to combined.txt")
-
- # ------------------------------------------------------------------------------
- # SECTION 6: Split data into training and validation sets
- # ------------------------------------------------------------------------------
-
- # Split data into train (80%) and validation (20%) sets
- train_texts, val_texts = train_test_split(all_texts, test_size=0.2, random_state=42)
- print(f"Number of training examples: {len(train_texts)}, Number of validation examples: {len(val_texts)}")
-
- # Save the training data to "train.txt"
- with open("train.txt", "w", encoding="utf-8") as f:
-     for article in train_texts:
-         f.write(article + "\n")
-
- # Save the validation data to "valid.txt"
- with open("valid.txt", "w", encoding="utf-8") as f:
-     for article in val_texts:
-         f.write(article + "\n")
-
- print("Files train.txt and valid.txt have been saved")
-
- # ------------------------------------------------------------------------------
- # SECTION 7: Create pretraining data with masked sentences for masked language modeling
- # ------------------------------------------------------------------------------
-
- # Read the complete training text from "train.txt"
- with open("/kaggle/input/kaz-rus-eng-wiki/train.txt", "r", encoding="utf-8") as f:
-     text = f.read()
-
- # Tokenize the text into sentences using NLTK
- sentences = nltk.sent_tokenize(text)
-
- output_data = []
- for sentence in sentences:
-     sentence = sentence.strip()
-     # Select sentences that end with a period
-     if sentence.endswith('.'):
-         words = sentence.split()
-         if len(words) < 2:
-             masked_sentence = sentence
-         else:
-             # Randomly choose one word to replace with the [MASK] token
-             idx = random.randint(0, len(words) - 1)
-             words[idx] = "[MASK]"
-             masked_sentence = " ".join(words)
-         output_data.append({
-             "original_sentence": sentence,
-             "masked_sentence": masked_sentence
-         })
-
- # Save the pretraining examples in JSON format to "train_pretrain.json"
- with open("train_pretrain.json", "w", encoding="utf-8") as f:
-     json.dump(output_data, f, ensure_ascii=False, indent=4)
-
- print(f"Saved {len(output_data)} examples to train_pretrain.json")
-
- # ------------------------------------------------------------------------------
- # SECTION 8: Train a WordPiece tokenizer using the tokenizers library
- # ------------------------------------------------------------------------------
-
- # Read the text file for tokenizer training (using the validation file here)
- with open("/kaggle/working/valid.txt", "r", encoding="utf-8") as f:
-     texts = f.readlines()
-
- # Create a WordPiece tokenizer with an unknown token
- tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
- tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
-
- # Define special tokens
- special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
-
- # Setup the WordPiece trainer with vocabulary size and minimum frequency
- trainer = trainers.WordPieceTrainer(
-     vocab_size=30_000,
-     min_frequency=2,
-     special_tokens=special_tokens
- )
-
- # Train the tokenizer on the texts
- tokenizer.train_from_iterator(texts, trainer)
-
- # Save the vocabulary to "vocab.txt"
- with open("vocab.txt", "w", encoding="utf-8") as f:
-     for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda x: x[1]):
-         f.write(token + "\n")
-
- # Save the tokenizer model in JSON format to "tokenizer.json"
- tokenizer.save("tokenizer.json")
-
- # Create and save the special tokens map as JSON
- special_tokens_map = {
-     "unk_token": "[UNK]",
-     "sep_token": "[SEP]",
-     "pad_token": "[PAD]",
-     "cls_token": "[CLS]",
-     "mask_token": "[MASK]"
- }
- with open("special_tokens_map.json", "w", encoding="utf-8") as f:
-     json.dump(special_tokens_map, f, indent=4)
-
- # Create and save the tokenizer configuration as JSON
- tokenizer_config = {
-     "do_lower_case": False,
-     "vocab_size": 30_000,
-     "model_max_length": 512,
-     "special_tokens_map_file": "special_tokens_map.json"
- }
- with open("tokenizer_config.json", "w", encoding="utf-8") as f:
-     json.dump(tokenizer_config, f, indent=4)
-
- print("✅ Tokenizer training completed! Files 'tokenizer.json', 'vocab.txt', 'special_tokens_map.json', and 'tokenizer_config.json' have been saved.")
 
+ import re
+ import unicodedata
  import random

+ # File paths
+ input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
+ train_file = "train.txt"
+ dev_file = "dev.txt"
+
+ # Functions for cleaning and normalizing text
+ def normalize_text(text):
+     text = unicodedata.normalize("NFC", text)  # Normalize to Unicode NFC form
+     text = text.lower()  # Convert to lowercase
+     return text
+
+ def clean_text(text):
+     text = re.sub(r"<[^>]+>", " ", text)  # Remove HTML tags
+     text = re.sub(r"\[\[.*?\]\]", " ", text)  # Remove wiki markup
+     text = re.sub(r"\s+", " ", text)  # Collapse multiple spaces into one
+     text = text.strip()
+     return text
+
+ # Read and clean the text
+ cleaned_lines = []
+ with open(input_file, "r", encoding="utf-8") as f:
+     for line in f:
+         line = line.strip()
+         if len(line) < 10:  # Skip short lines
+             continue
+         line = normalize_text(line)
+         line = clean_text(line)
+         if line:
+             cleaned_lines.append(line)
+
+ # Shuffle the data
+ random.shuffle(cleaned_lines)
+
+ # Split into train/dev
+ split_index = int(0.8 * len(cleaned_lines))
+ train_lines = cleaned_lines[:split_index]
+ dev_lines = cleaned_lines[split_index:]
+
+ # Save the files
+ with open(train_file, "w", encoding="utf-8") as f:
+     for line in train_lines:
+         f.write(line + "\n")
+
+ with open(dev_file, "w", encoding="utf-8") as f:
+     for line in dev_lines:
+         f.write(line + "\n")
+
+ print(f"Dataset ready! Train: {len(train_lines)}, Dev: {len(dev_lines)}")