Fill-Mask · Transformers · Safetensors · PyTorch · Kazakh · Russian · English · bert
Eraly-ml committed (verified) · Commit 6d56cee · 1 Parent(s): 2171bc2

Update data-pipline.py

Files changed (1)
  1. data-pipline.py +50 -186
data-pipline.py CHANGED
@@ -1,188 +1,52 @@
- # Import required libraries
- import pandas as pd
- from sklearn.model_selection import train_test_split
- import os
- import json
  import random
- import nltk
- from tokenizers import Tokenizer, models, pre_tokenizers, trainers

- # Download NLTK's punkt tokenizer if not already downloaded
- nltk.download('punkt')
-
- # ------------------------------------------------------------------------------
- # SECTION 1: Define file paths for datasets
- # ------------------------------------------------------------------------------
-
- # File paths
- kazakh_path = '/kaggle/input/eng-kaz/kk_wiki_articles.txt'
- english_path = '/kaggle/input/eng-kaz/test-00000-of-00001.parquet'
- russian_json_path = "hf://datasets/Den4ikAI/russian_cleared_wikipedia/wiki_dataset.json"
-
- # ------------------------------------------------------------------------------
- # SECTION 2: Load and preprocess the Kazakh dataset
- # ------------------------------------------------------------------------------
-
- # Load Kazakh dataset (each line is an article)
- with open(kazakh_path, "r", encoding="utf-8") as f:
-     kazakh_texts = f.readlines()
-
- # Strip extra spaces and remove empty lines
- kazakh_texts = [line.strip() for line in kazakh_texts if line.strip()]
- print(f"Number of Kazakh articles: {len(kazakh_texts)}")
-
- # ------------------------------------------------------------------------------
- # SECTION 3: Load and preprocess the English dataset (Parquet format)
- # ------------------------------------------------------------------------------
-
- # Load the English dataset from a Parquet file
- english_df = pd.read_parquet(english_path)
- print("English dataset columns:", english_df.columns.tolist())
-
- # Assume the text is stored in the column 'text'
- if 'text' in english_df.columns:
-     english_texts = english_df['text'].dropna().tolist()
- else:
-     # If the column name is different, use the first column
-     english_texts = english_df.iloc[:, 0].dropna().tolist()
- print(f"Number of English articles: {len(english_texts)}")
-
- # ------------------------------------------------------------------------------
- # SECTION 4: Load and preprocess the Russian dataset (JSON lines)
- # ------------------------------------------------------------------------------
-
- # Load Russian dataset (JSON, with lines=True)
- russian_df = pd.read_json(russian_json_path, lines=True)
- print("Russian dataset columns:", russian_df.columns.tolist())
-
- # Assume the text is stored in the 'text' column
- if 'text' in russian_df.columns:
-     russian_texts = russian_df['text'].dropna().tolist()
- else:
-     russian_texts = russian_df.iloc[:, 0].dropna().tolist()
- print(f"Number of Russian articles: {len(russian_texts)}")
-
- # ------------------------------------------------------------------------------
- # SECTION 5: Combine all articles and save to a combined file
- # ------------------------------------------------------------------------------
-
- # Combine all texts from the three datasets into one list
- all_texts = kazakh_texts + english_texts + russian_texts
- print(f"Total number of articles: {len(all_texts)}")
-
- # Save the combined articles to a file "combined.txt"
- with open("combined.txt", "w", encoding="utf-8") as f:
-     for article in all_texts:
-         f.write(article + "\n")
- print("Combined dataset saved to combined.txt")
-
- # ------------------------------------------------------------------------------
- # SECTION 6: Split data into training and validation sets
- # ------------------------------------------------------------------------------
-
- # Split data into train (80%) and validation (20%) sets
- train_texts, val_texts = train_test_split(all_texts, test_size=0.2, random_state=42)
- print(f"Number of training examples: {len(train_texts)}, Number of validation examples: {len(val_texts)}")
-
- # Save the training data to "train.txt"
- with open("train.txt", "w", encoding="utf-8") as f:
-     for article in train_texts:
-         f.write(article + "\n")
-
- # Save the validation data to "valid.txt"
- with open("valid.txt", "w", encoding="utf-8") as f:
-     for article in val_texts:
-         f.write(article + "\n")
-
- print("Files train.txt and valid.txt have been saved")
-
- # ------------------------------------------------------------------------------
- # SECTION 7: Create pretraining data with masked sentences for masked language modeling
- # ------------------------------------------------------------------------------
-
- # Read the complete training text from "train.txt"
- with open("/kaggle/input/kaz-rus-eng-wiki/train.txt", "r", encoding="utf-8") as f:
-     text = f.read()
-
- # Tokenize the text into sentences using NLTK
- sentences = nltk.sent_tokenize(text)
-
- output_data = []
- for sentence in sentences:
-     sentence = sentence.strip()
-     # Select sentences that end with a period
-     if sentence.endswith('.'):
-         words = sentence.split()
-         if len(words) < 2:
-             masked_sentence = sentence
-         else:
-             # Randomly choose one word to replace with the [MASK] token
-             idx = random.randint(0, len(words) - 1)
-             words[idx] = "[MASK]"
-             masked_sentence = " ".join(words)
-         output_data.append({
-             "original_sentence": sentence,
-             "masked_sentence": masked_sentence
-         })
-
- # Save the pretraining examples in JSON format to "train_pretrain.json"
- with open("train_pretrain.json", "w", encoding="utf-8") as f:
-     json.dump(output_data, f, ensure_ascii=False, indent=4)
-
- print(f"Saved {len(output_data)} examples to train_pretrain.json")
-
- # ------------------------------------------------------------------------------
- # SECTION 8: Train a WordPiece tokenizer using the tokenizers library
- # ------------------------------------------------------------------------------
-
- # Read the text file for tokenizer training (using the validation file here)
- with open("/kaggle/working/valid.txt", "r", encoding="utf-8") as f:
-     texts = f.readlines()
-
- # Create a WordPiece tokenizer with an unknown token
- tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
- tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
-
- # Define special tokens
- special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
-
- # Setup the WordPiece trainer with vocabulary size and minimum frequency
- trainer = trainers.WordPieceTrainer(
-     vocab_size=30_000,
-     min_frequency=2,
-     special_tokens=special_tokens
- )
-
- # Train the tokenizer on the texts
- tokenizer.train_from_iterator(texts, trainer)
-
- # Save the vocabulary to "vocab.txt"
- with open("vocab.txt", "w", encoding="utf-8") as f:
-     for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda x: x[1]):
-         f.write(token + "\n")
-
- # Save the tokenizer model in JSON format to "tokenizer.json"
- tokenizer.save("tokenizer.json")
-
- # Create and save the special tokens map as JSON
- special_tokens_map = {
-     "unk_token": "[UNK]",
-     "sep_token": "[SEP]",
-     "pad_token": "[PAD]",
-     "cls_token": "[CLS]",
-     "mask_token": "[MASK]"
- }
- with open("special_tokens_map.json", "w", encoding="utf-8") as f:
-     json.dump(special_tokens_map, f, indent=4)
-
- # Create and save the tokenizer configuration as JSON
- tokenizer_config = {
-     "do_lower_case": False,
-     "vocab_size": 30_000,
-     "model_max_length": 512,
-     "special_tokens_map_file": "special_tokens_map.json"
- }
- with open("tokenizer_config.json", "w", encoding="utf-8") as f:
-     json.dump(tokenizer_config, f, indent=4)
-
- print("✅ Tokenizer training completed! Files 'tokenizer.json', 'vocab.txt', 'special_tokens_map.json', and 'tokenizer_config.json' have been saved.")
 
+ import re
+ import unicodedata
  import random

+ # File paths
+ input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
+ train_file = "train.txt"
+ dev_file = "dev.txt"
+
+ # Functions for cleaning and normalizing text
+ def normalize_text(text):
+     text = unicodedata.normalize("NFC", text)  # Normalize to Unicode NFC form
+     text = text.lower()  # Convert to lowercase
+     return text
+
+ def clean_text(text):
+     text = re.sub(r"<[^>]+>", " ", text)  # Remove HTML tags
+     text = re.sub(r"\[\[.*?\]\]", " ", text)  # Remove wiki markup
+     text = re.sub(r"\s+", " ", text)  # Collapse multiple spaces into one
+     text = text.strip()
+     return text
+
+ # Read and clean the text
+ cleaned_lines = []
+ with open(input_file, "r", encoding="utf-8") as f:
+     for line in f:
+         line = line.strip()
+         if len(line) < 10:  # Skip short lines
+             continue
+         line = normalize_text(line)
+         line = clean_text(line)
+         if line:
+             cleaned_lines.append(line)
+
+ # Shuffle the data
+ random.shuffle(cleaned_lines)
+
+ # Split into train/dev
+ split_index = int(0.8 * len(cleaned_lines))
+ train_lines = cleaned_lines[:split_index]
+ dev_lines = cleaned_lines[split_index:]
+
+ # Save the files
+ with open(train_file, "w", encoding="utf-8") as f:
+     for line in train_lines:
+         f.write(line + "\n")
+
+ with open(dev_file, "w", encoding="utf-8") as f:
+     for line in dev_lines:
+         f.write(line + "\n")
+
+ print(f"Dataset ready! Train: {len(train_lines)}, Dev: {len(dev_lines)}")