Commit 97fe9c2 · Parent(s): f0e3a09
Upload 4 files

Files changed:
- config.py +12 -0
- load_model.py +27 -0
- transformer.py +273 -0
- translator.py +45 -0
config.py
ADDED
@@ -0,0 +1,12 @@
class Config:
    vocab_size = 28000        # vocabulary size
    sequence_length = 35
    batch_size = 128
    validation_split = 0.20
    embed_dim = 300
    num_layers = 4
    latent_dim = 2048
    num_heads = 12
    epochs = 50               # number of epochs to train
    is_training = True

config = Config()
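Note: the values above are consumed by the other files in this commit; in particular, transformer.py expects pretrained embedding matrices shaped (vocab_size, embed_dim). A minimal sanity-check sketch (the random matrices here are placeholders for illustration, not the embeddings the Space actually uses):

import numpy as np
from config import config

# Placeholder embedding matrices with the shape the Transformer expects:
# one row per vocabulary entry, one column per embedding dimension.
en_emb_matrix = np.random.uniform(
    -0.05, 0.05, (config.vocab_size, config.embed_dim)).astype("float32")
de_emb_matrix = np.random.uniform(
    -0.05, 0.05, (config.vocab_size, config.embed_dim)).astype("float32")

assert en_emb_matrix.shape == (28000, 300)
assert config.embed_dim % 2 == 0  # positional_encoding splits the depth in half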
load_model.py
ADDED
@@ -0,0 +1,27 @@
from transformer import Transformer
import tensorflow_text as tf_text
import tensorflow as tf
from config import config


def load_model(en_emb_matrix, de_emb_matrix, model_path, config):
    # Rebuild the Transformer with the same hyperparameters it was trained with,
    # then restore the trained weights from `model_path`.
    model = Transformer(
        num_layers=config.num_layers,
        d_model=config.embed_dim,
        num_heads=config.num_heads,
        en_embedding_matrix=en_emb_matrix,
        de_embedding_matrix=de_emb_matrix,
        dff=config.latent_dim,
        input_vocab_size=config.vocab_size,
        target_vocab_size=config.vocab_size,
        dropout_rate=0.2
    )
    model.load_weights(model_path)
    return model


def load_sp_model(path_en, path_ur):
    # SentencePiece tokenizers: the English tokenizer adds BOS/EOS markers; the Urdu
    # tokenizer additionally reverses the token order, matching the training setup.
    sp_model_en = tf_text.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(path_en, 'rb').read(),
        add_bos=True, add_eos=True)
    sp_model_ur = tf_text.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(path_ur, 'rb').read(),
        reverse=True, add_bos=True, add_eos=True)
    return sp_model_en, sp_model_ur
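Note: a minimal sketch of how these loaders are used. The paths below are placeholders (this commit does not define them); real SentencePiece model files are needed for the calls to run.

from load_model import load_sp_model

# Placeholder paths -- substitute the SentencePiece models shipped with the Space.
sp_model_en, sp_model_ur = load_sp_model("en_sp.model", "ur_sp.model")

tokens = sp_model_en.tokenize(["How are you?"])   # ragged int32 IDs, with BOS/EOS added
text = sp_model_en.detokenize(tokens)             # round-trips back to the input sentence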
transformer.py
ADDED
@@ -0,0 +1,273 @@
import tensorflow as tf
import numpy as np
from config import config

def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]      # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)

    angle_rates = 1 / (10000**depths)                 # (1, depth)
    angle_rads = positions * angle_rates              # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, embedding_matrix):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, d_model,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            mask_zero=True)
        # config.latent_dim (2048) is used here as the maximum number of positions.
        self.pos_encoding = positional_encoding(length=config.latent_dim, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positional_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x

class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

class CausalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x,
            use_causal_mask=True)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.ffn = FeedForward(d_model, dff)

    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, embedding_matrix,
                 dff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding_matrix = embedding_matrix

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model, embedding_matrix=embedding_matrix)

        self.enc_layers = [
            EncoderLayer(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        # `x` is token IDs, shape (batch, seq_len).
        x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

        # Add dropout.
        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # Shape `(batch_size, seq_len, d_model)`.

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self,
                 *,
                 d_model,
                 num_heads,
                 dff,
                 dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.ffn = FeedForward(d_model, dff)

    def call(self, x, context):
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later.
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x

class Decoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, embedding_matrix, dff,
                 vocab_size, dropout_rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding_matrix = embedding_matrix

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model, embedding_matrix=embedding_matrix)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dec_layers = [
            DecoderLayer(d_model=d_model, num_heads=num_heads,
                         dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)]

        self.last_attn_scores = None

    def call(self, x, context):
        # `x` is token IDs, shape (batch, target_seq_len).
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        # The shape of x is (batch_size, target_seq_len, d_model).
        return x

class Transformer(tf.keras.Model):
    def __init__(self, *, num_layers, d_model, num_heads, en_embedding_matrix,
                 de_embedding_matrix, dff, input_vocab_size, target_vocab_size,
                 dropout_rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                               num_heads=num_heads, embedding_matrix=en_embedding_matrix,
                               dff=dff, vocab_size=input_vocab_size,
                               dropout_rate=dropout_rate)

        self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                               num_heads=num_heads, embedding_matrix=de_embedding_matrix,
                               dff=dff, vocab_size=target_vocab_size,
                               dropout_rate=dropout_rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        context, x = inputs

        context = self.encoder(context)  # (batch_size, context_len, d_model)

        x = self.decoder(x, context)     # (batch_size, target_len, d_model)

        # Final linear layer output.
        logits = self.final_layer(x)     # (batch_size, target_len, target_vocab_size)

        try:
            # Drop the Keras mask so it doesn't scale the losses/metrics.
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass

        # Return the final output.
        return logits
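Note: a quick shape check for the model above, using toy dimensions and random embedding matrices; these numbers are illustrative and deliberately smaller than those in config.py.

import numpy as np
import tensorflow as tf
from transformer import Transformer

vocab, d_model = 1000, 64  # toy sizes for a fast check
emb = np.random.uniform(-0.05, 0.05, (vocab, d_model)).astype("float32")

model = Transformer(num_layers=2, d_model=d_model, num_heads=4,
                    en_embedding_matrix=emb, de_embedding_matrix=emb,
                    dff=128, input_vocab_size=vocab, target_vocab_size=vocab)

src = tf.random.uniform((2, 10), minval=1, maxval=vocab, dtype=tf.int64)  # (batch, src_len)
tgt = tf.random.uniform((2, 7), minval=1, maxval=vocab, dtype=tf.int64)   # (batch, tgt_len)

logits = model([src, tgt])
print(logits.shape)  # (2, 7, 1000): one logit per target position per vocabulary entry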
translator.py
ADDED
@@ -0,0 +1,45 @@
import tensorflow as tf
import numpy as np
from config import config

class Translator(tf.Module):
    def __init__(self, sp_model_en, sp_model_ur, transformer):
        self.sp_model_en = sp_model_en
        self.sp_model_ur = sp_model_ur
        self.transformer = transformer

    def __call__(self, sentence, max_length=config.sequence_length):
        sentence = tf.constant(sentence)
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]

        # Tokenize the English sentence.
        sentence = self.sp_model_en.tokenize(sentence).to_tensor()

        encoder_input = sentence

        # Tokenizing an empty string yields only the special markers; the first is
        # used as the decoding start token and the second as the end token. The
        # Urdu output is initialized with the start token.
        start = self.sp_model_ur.tokenize([''])[0][0][tf.newaxis]
        end = self.sp_model_ur.tokenize([''])[0][1][tf.newaxis]

        output_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)

        for i in tf.range(max_length):
            output = tf.transpose(output_array.stack())
            predictions = self.transformer([encoder_input, output], training=False)

            # Select the logits for the last predicted position.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

            predicted_id = tf.argmax(predictions, axis=-1)
            predicted_id = tf.cast(predicted_id, tf.int32)

            output_array = output_array.write(i + 1, predicted_id[0])

            if predicted_id == end:
                break

        output = tf.transpose(output_array.stack())
        text = self.sp_model_ur.detokenize(output)[0]  # Shape: `()`.

        return text.numpy().decode('utf-8')
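Note: for reference, a minimal end-to-end sketch of how the four files fit together. The artifact paths and zero-filled embedding matrices are placeholders, not files defined in this commit; with the real artifacts, the call returns the Urdu translation as a Python string.

import numpy as np
from config import config
from load_model import load_model, load_sp_model
from translator import Translator

# Placeholder artifact paths -- replace with the Space's real tokenizer models and weights.
sp_en, sp_ur = load_sp_model("en_sp.model", "ur_sp.model")

en_emb = np.zeros((config.vocab_size, config.embed_dim), dtype="float32")
de_emb = np.zeros((config.vocab_size, config.embed_dim), dtype="float32")
model = load_model(en_emb, de_emb, "transformer_weights", config)

translator = Translator(sp_en, sp_ur, model)
print(translator("How are you?"))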