Upload 5 files
- create_dataset_splits.py +0 -6
- eval.py +5 -7
- preprocess_dataset.py +0 -17
- train_tokenizers.py +0 -4
create_dataset_splits.py CHANGED
@@ -4,13 +4,9 @@ import tensorflow.data as tf_data
 import pickle
 import random
 
-
-
-
 def read_files(path):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
@@ -18,8 +14,6 @@ def save_list_to_file(file_path, string_list):
     with open(file_path, 'w') as file:
         file.writelines([f"{string}\n" for string in string_list])
 
-
-#EUROPARL cs-en
 #load files
 cs_file = 'datasets/europarl/europarl-v7.cs-en.cs'
 en_file = 'datasets/europarl/europarl-v7.cs-en.en'
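The split logic itself is not visible in this diff; only the helpers and the Europarl source paths survive the hunks. For orientation, here is a minimal sketch of how `read_files` and `save_list_to_file` from this file could be combined to produce the aligned splits that preprocess_dataset.py later reads. The seed and split sizes are assumptions, not values from the repository.

# Sketch only (not in the commit): shuffle the aligned cs/en pairs together,
# then write train/valid/test files; seed and split sizes are assumptions.
cs_lines = read_files(cs_file)
en_lines = read_files(en_file)

pairs = list(zip(cs_lines, en_lines))
random.seed(42)                      # assumed seed
random.shuffle(pairs)

n_valid = n_test = 2000              # assumed split sizes
valid = pairs[:n_valid]
test = pairs[n_valid:n_valid + n_test]
train = pairs[n_valid + n_test:]

for name, split in [("train", train), ("valid", valid), ("test", test)]:
    save_list_to_file(f"datasets/europarl/{name}-cs-en.cs", [cs for cs, _ in split])
    save_list_to_file(f"datasets/europarl/{name}-cs-en.en", [en for _, en in split])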
eval.py CHANGED
@@ -2,18 +2,17 @@
 import keras_nlp
 import keras
 import tensorflow.data as tf_data
-import pickle
 import tensorflow as tf
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
-import datetime
-import random
 import re
 from sacrebleu.metrics import CHRF
 import time
-
-
+
+
 MAX_SEQUENCE_LENGTH = 64
+eval_samples = 100
+
 
 transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20231209_0046.keras')
 def read_files(path, lowercase = False):
@@ -46,7 +45,6 @@ def next_token_logits(encoder_input_tokens, prompt, predicted_token_index):
 
 
 def greedy_decode(encoder_input_tokens, prompt, end_token_id):
-
     start_index = 1
     current_prompt = prompt
     for predicted_token_index in range(start_index, MAX_SEQUENCE_LENGTH):
@@ -152,7 +150,7 @@ bleu_metrics = keras_nlp.metrics.Bleu(
     tokenizer = cs_tokenizer
 )
 
-
+
 chrf = CHRF()
 refs = test_cs[:eval_samples]
 translations = []
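Only fragments of the evaluation loop appear in this diff. A minimal sketch of how the `chrf`, `refs`, and `translations` objects set up above are typically combined with sacrebleu's corpus_score follows; the `translate` helper is hypothetical shorthand for the tokenize, greedy_decode, detokenize pipeline in eval.py, and `test_en` is assumed to be loaded earlier in the file via read_files.

# Sketch only: fill `translations` and score against `refs` with sacrebleu.
# `translate` is a hypothetical wrapper around greedy_decode; it is not
# defined in the hunks shown here.
for sentence in test_en[:eval_samples]:
    translations.append(translate(sentence))

# sacrebleu's CHRF.corpus_score takes a list of hypotheses and a list of reference lists
result = chrf.corpus_score(translations, [refs])
print(f"chrF: {result.score:.2f}")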
preprocess_dataset.py CHANGED
@@ -12,13 +12,10 @@ MAX_SEQUENCE_LENGTH = 64
 def read_files(path, lowercase = False):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     if(lowercase):
         dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
-# en_vocab = read_files("tokenizers/en_opus_vocab")
-# cs_vocab = read_files("tokenizers/cs_opus_vocab")
 en_vocab = read_files("tokenizers/en_europarl_vocab")
 cs_vocab = read_files("tokenizers/cs_europarl_vocab")
 
@@ -32,42 +29,28 @@ cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
 )
 
 
-#opus
-# train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
-# train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
-# valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
-# valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
-# test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
-# test_en_file = 'datasets/cs-en/opus.cs-en-test.en'
-
 #europarl
 train_cs_file = 'datasets/europarl/train-cs-en.cs'
 train_en_file = 'datasets/europarl/train-cs-en.en'
 valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
 valid_en_file = 'datasets/europarl/valid-cs-en.en'
-test_cs_file = 'datasets/europarl/test-cs-en.cs'
-test_en_file = 'datasets/europarl/test-cs-en.en'
 
 
 train_cs = read_files(train_cs_file, True)
 train_en = read_files(train_en_file, True)
 valid_cs = read_files(valid_cs_file, True)
 valid_en = read_files(valid_en_file, True)
-test_cs = read_files(test_cs_file, True)
-test_en = read_files(test_en_file, True)
 
 def preprocess_batch(en, cs):
     en = en_tokenizer(en)
     cs = cs_tokenizer(cs)
 
-    # Pad `eng` to `MAX_SEQUENCE_LENGTH`.
     en_start_end_packer = keras_nlp.layers.StartEndPacker(
         sequence_length=MAX_SEQUENCE_LENGTH,
         pad_value=en_tokenizer.token_to_id("[PAD]"),
     )
     en = en_start_end_packer(en)
 
-    # Add special tokens (`"[START]"` and `"[END]"`) to `spa` and pad it as well.
     cs_start_end_packer = keras_nlp.layers.StartEndPacker(
         sequence_length=MAX_SEQUENCE_LENGTH + 1,
         start_value=cs_tokenizer.token_to_id("[START]"),
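The hunk cuts `preprocess_batch` off inside the second `StartEndPacker`. The file appears to follow the KerasNLP English-Spanish translation example (the deleted comments still mention `eng` and `spa`), so a plausible continuation and tf.data wiring look roughly like the sketch below; the batch size, shuffle buffer, and the `make_dataset` name are assumptions, not code from the repository.

# Hypothetical continuation of preprocess_batch and dataset wiring, modeled on
# the KerasNLP translation example this file resembles; not taken from the diff.
import tensorflow.data as tf_data

#     ...inside preprocess_batch, after cs_start_end_packer is built:
#     cs = cs_start_end_packer(cs)
#     return (
#         {"encoder_inputs": en, "decoder_inputs": cs[:, :-1]},  # model inputs
#         cs[:, 1:],                                             # labels, shifted by one token
#     )

def make_dataset(en_texts, cs_texts, batch_size=64):   # batch_size is an assumption
    ds = tf_data.Dataset.from_tensor_slices((list(en_texts), list(cs_texts)))
    ds = ds.batch(batch_size).map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    return ds.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_en, train_cs)
valid_ds = make_dataset(valid_en, valid_cs)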
train_tokenizers.py CHANGED
@@ -4,13 +4,9 @@ import tensorflow.data as tf_data
 import pickle
 import random
 
-
 EN_VOCAB_SIZE = 30000
 CS_VOCAB_SIZE = 30000
 
-
-
-
 def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
     word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(