jkot committed
Commit e05693a · 1 Parent(s): 1308da5

Upload 5 files

Files changed (4)
  1. eval.py +0 -1
  2. preprocess_dataset.py +0 -4
  3. train.py +0 -2
  4. train_tokenizers.py +0 -18
eval.py CHANGED
@@ -20,7 +20,6 @@ transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20
 def read_files(path, lowercase = False):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     if(lowercase):
         dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
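For orientation, eval.py loads the saved translator with keras.models.load_model (the directory name is truncated in the hunk header above) and reads its evaluation splits with the read_files helper shown in this hunk. A minimal, hedged sketch of that entry point, with placeholder paths that are not taken from the repo:

    import keras

    # SAVED_MODEL_DIR is a placeholder; the real directory name is truncated
    # in the hunk header above and not reproduced here.
    SAVED_MODEL_DIR = "models_europarl/..."  # fill in the actual saved-model path
    transformer = keras.models.load_model(SAVED_MODEL_DIR)

    # Hypothetical test-split paths; only the train paths appear in this commit.
    test_en = read_files("datasets/europarl/test-cs-en.en", lowercase=True)
    test_cs = read_files("datasets/europarl/test-cs-en.cs", lowercase=True)
    print(len(test_en), len(test_cs))  # number of parallel evaluation sentences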
preprocess_dataset.py CHANGED
@@ -1,14 +1,10 @@
 
 import keras_nlp
-import keras
 import tensorflow.data as tf_data
-import pickle
 #hyperparameters
 BATCH_SIZE = 16
 MAX_SEQUENCE_LENGTH = 64
 
-#load tokenizers/en_vocab to list
-
 def read_files(path, lowercase = False):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
 
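The rest of preprocess_dataset.py is not visible in this hunk, but the imports and hyperparameters point at a tf.data pipeline that tokenizes both languages and packs every sentence to MAX_SEQUENCE_LENGTH in batches of BATCH_SIZE. Below is a hedged sketch of what such a pipeline typically looks like with keras_nlp; the vocab file paths come from train_tokenizers.py, while the "encoder_inputs"/"decoder_inputs" feature names and the exact packing choices are assumptions, not this repo's code:

    import keras_nlp
    import tensorflow.data as tf_data

    BATCH_SIZE = 16
    MAX_SEQUENCE_LENGTH = 64

    # Rebuild the WordPiece tokenizers from the vocab files written by
    # train_tokenizers.py (one token per line).
    en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary="tokenizers/en_europarl_vocab"
    )
    cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary="tokenizers/cs_europarl_vocab"
    )

    def preprocess_batch(en, cs):
        # Tokenize, then pad/truncate to a fixed length. The target side gets
        # [START]/[END] markers and one extra position so that shifting it by
        # one yields aligned decoder inputs and labels.
        en = en_tokenizer(en)
        cs = cs_tokenizer(cs)
        en = keras_nlp.layers.StartEndPacker(
            sequence_length=MAX_SEQUENCE_LENGTH,
            pad_value=en_tokenizer.token_to_id("[PAD]"),
        )(en)
        cs = keras_nlp.layers.StartEndPacker(
            sequence_length=MAX_SEQUENCE_LENGTH + 1,
            start_value=cs_tokenizer.token_to_id("[START]"),
            end_value=cs_tokenizer.token_to_id("[END]"),
            pad_value=cs_tokenizer.token_to_id("[PAD]"),
        )(cs)
        return {"encoder_inputs": en, "decoder_inputs": cs[:, :-1]}, cs[:, 1:]

    def make_dataset(en_texts, cs_texts):
        ds = tf_data.Dataset.from_tensor_slices((list(en_texts), list(cs_texts)))
        ds = ds.batch(BATCH_SIZE)
        ds = ds.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
        return ds.shuffle(2048).prefetch(16).cache()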
train.py CHANGED
@@ -2,7 +2,6 @@
 import keras_nlp
 import keras
 import tensorflow.data as tf_data
-import pickle
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
 import datetime
@@ -13,7 +12,6 @@ EPOCHS = 20
 EMBED_DIM = 256
 INTERMEDIATE_DIM = 2048
 NUM_HEADS = 8
-# TODO probably change dynamically
 MAX_SEQUENCE_LENGTH = 128
 EN_VOCAB_SIZE = 30000
 CS_VOCAB_SIZE = 30000
 
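The model definition itself sits outside these hunks. For reference, the hyperparameters above map directly onto keras_nlp's stock encoder-decoder Transformer recipe; the sketch below shows that mapping and is an assumption about the architecture (including the single encoder/decoder block), not code taken from train.py:

    import keras
    import keras_nlp
    from tensorflow.keras.optimizers import Adam  # mirrors the train.py import

    # Constants as in the hunk above.
    EMBED_DIM = 256
    INTERMEDIATE_DIM = 2048
    NUM_HEADS = 8
    MAX_SEQUENCE_LENGTH = 128
    EN_VOCAB_SIZE = 30000
    CS_VOCAB_SIZE = 30000

    # Encoder: embed English token ids and run a TransformerEncoder block.
    encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
    x = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=EN_VOCAB_SIZE,
        sequence_length=MAX_SEQUENCE_LENGTH,
        embedding_dim=EMBED_DIM,
    )(encoder_inputs)
    encoder_outputs = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
    )(inputs=x)

    # Decoder: embed Czech token ids, cross-attend to the encoder output,
    # and predict the next-token distribution over the Czech vocabulary.
    decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    x = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=CS_VOCAB_SIZE,
        sequence_length=MAX_SEQUENCE_LENGTH,
        embedding_dim=EMBED_DIM,
    )(decoder_inputs)
    x = keras_nlp.layers.TransformerDecoder(
        intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
    )(decoder_sequence=x, encoder_sequence=encoder_outputs)
    decoder_outputs = keras.layers.Dense(CS_VOCAB_SIZE, activation="softmax")(x)

    transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    transformer.compile(
        optimizer=Adam(),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

The EarlyStopping and ModelCheckpoint imports retained in the first hunk suggest transformer.fit(...) is wrapped with those callbacks, with datetime presumably used to timestamp the checkpoint directory.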
train_tokenizers.py CHANGED
@@ -1,8 +1,5 @@
 import keras_nlp
-import keras
 import tensorflow.data as tf_data
-import pickle
-import random
 
 EN_VOCAB_SIZE = 30000
 CS_VOCAB_SIZE = 30000
@@ -20,27 +17,12 @@ def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path
 def read_files(path):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
-#OPUS cs-en
-# train_cs = read_files('datasets/cs-en/opus.cs-en-train.cs')
-# train_en = read_files('datasets/cs-en/opus.cs-en-train.en')
-
-
-#EUROPARL cs-en
 train_cs = read_files('datasets/europarl/train-cs-en.cs')
 train_en = read_files('datasets/europarl/train-cs-en.en')
-
-
-
-print(train_cs[0])
-print(train_en[0])
-
-
 reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
-
 en_vocab = train_word_piece(train_en, EN_VOCAB_SIZE, reserved_tokens, "tokenizers/en_europarl_vocab")
 cs_vocab = train_word_piece(train_cs, CS_VOCAB_SIZE, reserved_tokens, "tokenizers/cs_europarl_vocab")
 
 
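The body of train_word_piece sits above the second hunk, so only its signature is visible here (in the @@ header). A plausible sketch of such a helper, built on keras_nlp's compute_word_piece_vocabulary and offered as an assumption rather than the repo's actual implementation:

    import keras_nlp
    import tensorflow.data as tf_data

    def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
        # Stream the raw sentences through the WordPiece learner.
        word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
        vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
            word_piece_ds.batch(1000).prefetch(2),
            vocabulary_size=vocab_size,
            reserved_tokens=reserved_tokens,
        )
        # Persist the vocabulary, one token per line, so the tokenizers can be
        # rebuilt later (e.g. in preprocess_dataset.py) without re-training.
        with open(save_output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(vocab) + "\n")
        return vocab

Note that read_files already lower-cases every sentence before training, so both 30k-entry vocabularies are learned over lower-cased Europarl text.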