RishabA committed on
Commit 6569027 · verified · 1 Parent(s): a5ddee7

Upload 5 files
Language_Translation_Transformer_in_PyTorch_from_Scratch.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ import torch
+ import spacy
+ import gradio as gr
+ from model import make_model, translate_sentence, Vocab
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def load_tokenizers():
+     try:
+         spacy_es = spacy.load("es_core_news_sm")
+     except OSError:
+         os.system("python -m spacy download es_core_news_sm")
+         spacy_es = spacy.load("es_core_news_sm")
+     try:
+         spacy_en = spacy.load("en_core_web_sm")
+     except OSError:
+         os.system("python -m spacy download en_core_web_sm")
+         spacy_en = spacy.load("en_core_web_sm")
+     print("Tokenizers loaded.")
+     return spacy_es, spacy_en
+
+
+ spacy_es, spacy_en = load_tokenizers()
+
+ if os.path.exists("vocab.pt"):
+     vocab_src, vocab_trg = torch.load("vocab.pt", weights_only=False)  # vocab.pt stores pickled Vocab objects
+ else:
+     raise FileNotFoundError(
+         "vocab.pt not found. Please build and save the vocabularies first."
+     )
+
+ model = make_model(
+     device,
+     vocab_src,
+     vocab_trg,
+     n_layers=3,
+     d_model=512,
+     d_ffn=2048,
+     n_heads=8,
+     dropout=0.1,
+     max_length=50,
+ )
+ model.to(device)
+
+ if os.path.exists("translation_model.pt"):
+     model.load_state_dict(torch.load("translation_model.pt", map_location=device))
+     print("Pretrained model loaded.")
+ else:
+     raise FileNotFoundError(
+         "translation_model.pt not found. Please train and save the model first."
+     )
+
+
+ def translate(text):
+     translation = translate_sentence(
+         text, model, vocab_src, vocab_trg, spacy_es, device, max_length=50
+     )
+     return translation
+
+
+ css_str = """
+ .title {
+     font-size: 48px;
+     font-weight: bold;
+     text-align: center;
+     margin-bottom: 20px;
+ }
+ .description {
+     font-size: 20px;
+     text-align: center;
+     margin-bottom: 40px;
+ }
+ """
+
+ with gr.Blocks(css=css_str) as demo:
+     gr.Markdown("<div class='title'>Spanish-to-English Translator</div>")
+     gr.Markdown(
+         "<div class='description'>Enter a Spanish sentence below to receive its English translation.</div>"
+     )
+     with gr.Row():
+         txt_input = gr.Textbox(
+             label="Enter Spanish sentence", lines=2, placeholder="Ej: ¿Cómo estás?"
+         )
+         translate_btn = gr.Button("Translate")
+     txt_output = gr.Textbox(label="English Translation", lines=2)
+     translate_btn.click(fn=translate, inputs=txt_input, outputs=txt_output)
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
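
Note: app.py expects a vocab.pt file containing the source and target Vocab objects as a tuple. The snippet below is a minimal sketch, not part of this commit, of how such a file might be produced with build_vocab_from_iterator from model.py; the pairs variable (a list of (Spanish, English) sentence strings), the min_freq value, and the special-token order are illustrative assumptions.

# Hypothetical helper for building and saving vocab.pt (assumed data, not from the commit).
import torch
import spacy
from model import build_vocab_from_iterator

spacy_es = spacy.load("es_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")


def tokenize_es(text):
    # Lowercased tokens, matching what translate_sentence does at inference time
    return [tok.text.lower() for tok in spacy_es(text)]


def tokenize_en(text):
    return [tok.text.lower() for tok in spacy_en(text)]


def build_and_save_vocab(pairs, path="vocab.pt", min_freq=2):
    # `pairs` is an assumed list of (spanish_sentence, english_sentence) tuples
    specials = ["<unk>", "<pad>", "<bos>", "<eos>"]  # assumed ordering of special tokens
    vocab_src = build_vocab_from_iterator(
        (tokenize_es(es) for es, _ in pairs), min_freq=min_freq, specials=specials
    )
    vocab_trg = build_vocab_from_iterator(
        (tokenize_en(en) for _, en in pairs), min_freq=min_freq, specials=specials
    )
    torch.save((vocab_src, vocab_trg), path)  # app.py loads exactly this tuple
    return vocab_src, vocab_trg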
model.py ADDED
@@ -0,0 +1,627 @@
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ from collections import Counter
+
+
+ class Vocab:
+     def __init__(self, stoi, itos, default_index):
+         self.stoi = stoi  # mapping from token to index
+         self.itos = itos  # list of tokens
+         self.default_index = default_index  # default index for unknown words
+
+     def __getitem__(self, token):
+         # Return index of token
+         return self.stoi.get(
+             token, self.default_index
+         )  # If not found, return the default index
+
+     def get_stoi(self):
+         return self.stoi
+
+     def lookup_tokens(self, indices):
+         # Return the tokens at indices
+         return [self.itos[i] for i in indices]
+
+     def __len__(self):
+         return len(self.itos)
+
+     def __contains__(self, token):
+         return token in self.stoi
+
+     def __iter__(self):
+         return iter(self.itos)
+
+     def __repr__(self):
+         return f"Vocab({len(self)} tokens)"
+
+
+ def build_vocab_from_iterator(token_iterator, min_freq, specials):
+     counter = Counter()  # Use a Counter to get tokens and frequencies
+     for tokens in token_iterator:
+         counter.update(tokens)
+     tokens = [
+         token for token, freq in counter.items() if freq >= min_freq
+     ]  # Keep tokens with frequency >= min_freq
+     tokens = sorted(tokens)  # Sort alphabetically
+     itos = list(specials) + tokens
+     stoi = {token: idx for idx, token in enumerate(itos)}  # token-to-index
+     return Vocab(stoi=stoi, itos=itos, default_index=stoi.get("<unk>", 0))
+
+
+ """### Transformer Model"""
+
+
+ # Embedding Layer
+ class EmbeddingLayer(nn.Module):
+     def __init__(self, vocab_size: int, d_model: int):
+         """
+         vocab_size: size of the vocabulary
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         """
+         super().__init__()
+
+         self.d_model = d_model
+
+         # Embedding look-up table (vocab_size, d_model)
+         self.lut = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
+
+     def forward(self, x):
+         # x shape: (batch_size, seq_len)
+         # Multiply by the sqrt of d_model as a scale factor
+         return self.lut(x) * math.sqrt(self.d_model)  # (batch_size, seq_len, d_model)
+
+
+ """**Positional Encoding Equations**
+
+ $PE(k, 2i) = sin(\frac{k}{10000^{\frac{2i}{d_{model}}}})$
+
+ $PE(k, 2i + 1) = cos(\frac{k}{10000^{\frac{2i}{d_{model}}}})$
+ """
+
+
+ # Positional Encoding
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model: int, dropout: float = 0.1, max_length: int = 5000):
+         """
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         dropout: probability of dropout
+         max_length: max length of a sequence
+         """
+         super().__init__()
+
+         self.dropout = nn.Dropout(p=dropout)
+
+         pe = torch.zeros(max_length, d_model)  # (max_length, d_model)
+         # Create position column
+         k = torch.arange(0, max_length).unsqueeze(dim=1)  # (max_length, 1)
+
+         # Use the log version of the function for positional encodings
+         div_term = torch.exp(
+             torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
+         )  # (d_model / 2)
+
+         # Use sine for the even indices and cosine for the odd indices
+         pe[:, 0::2] = torch.sin(k * div_term)
+         pe[:, 1::2] = torch.cos(k * div_term)
+
+         pe = pe.unsqueeze(dim=0)  # Add the batch dimension: (1, max_length, d_model)
+
+         # We use a buffer because the positional encoding is fixed and not a model parameter that we want to be updated during backpropagation.
+         self.register_buffer(
+             "pe", pe
+         )  # Buffers are saved with the model state and are moved to the correct device
+
+     def forward(self, x):
+         # x shape: (batch_size, seq_length, d_model)
+         # Add the positional encoding to the embeddings that are passed in
+         x += self.pe[:, : x.size(1)]
+         return self.dropout(x)
+
+
+ """**Multi-Head Self-Attention Equations:**
+
+ $Q = X W_q$
+
+ $K = X W_k$
+
+ $V = X W_v$
+
+ $Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_{key}}})V$
+ """
+
+
+ # Multi-Head Self-Attention
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, d_model: int = 512, n_heads: int = 8, dropout: float = 0.1):
+         """
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         n_heads: number of self attention heads per sequence
+         dropout: probability of dropout
+         """
+         super().__init__()
+         assert (
+             d_model % n_heads == 0
+         )  # We want to make sure that the dimensions are split evenly among the attention heads.
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.d_key = d_model // n_heads
+
+         self.Wq = nn.Linear(
+             in_features=d_model, out_features=d_model
+         )  # Learnable weights for query
+         self.Wk = nn.Linear(
+             in_features=d_model, out_features=d_model
+         )  # Learnable weights for key
+         self.Wv = nn.Linear(
+             in_features=d_model, out_features=d_model
+         )  # Learnable weights for value
+         self.Wo = nn.Linear(
+             in_features=d_model, out_features=d_model
+         )  # Learnable weights for output
+
+         self.dropout = nn.Dropout(p=dropout)
+
+     def forward(self, query: Tensor, key: Tensor, value: Tensor, mask: Tensor = None):
+         """
+         query: (batch_size, q_length, d_model)
+         key: (batch_size, k_length, d_model)
+         value: (batch_size, v_length, d_model)
+         """
+         batch_size = key.size(0)
+
+         # Matrix multiplication for Q, K, and V tensors
+         Q = self.Wq(query)
+         K = self.Wk(key)
+         V = self.Wv(value)
+
+         # Split each tensor into heads
+         Q = Q.view(batch_size, -1, self.n_heads, self.d_key).permute(
+             0, 2, 1, 3
+         )  # (batch_size, n_heads, q_length, d_key)
+         K = K.view(batch_size, -1, self.n_heads, self.d_key).permute(
+             0, 2, 1, 3
+         )  # (batch_size, n_heads, k_length, d_key)
+         V = V.view(batch_size, -1, self.n_heads, self.d_key).permute(
+             0, 2, 1, 3
+         )  # (batch_size, n_heads, v_length, d_key)
+
+         # Scaled dot product
+         # K^T becomes (batch_size, n_heads, d_key, k_length)
+         scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(
+             self.d_key
+         )  # (batch_size, n_heads, q_length, k_length)
+
+         if mask is not None:
+             scaled_dot_product = scaled_dot_product.masked_fill(
+                 mask == 0, float("-inf")
+             )  # Filling it with 0 would give a weight of 1 after softmax because e^0 = 1. Instead we fill it with a very large negative number.
+
+         # Softmax function for attention probabilities
+         attention_probs = torch.softmax(scaled_dot_product, dim=-1)
+
+         # Multiply by V to get attention with respect to the values
+         A = torch.matmul(self.dropout(attention_probs), V)
+
+         # Reshape attention back to (batch_size, q_length, d_model)
+         A = (
+             A.permute(0, 2, 1, 3)
+             .contiguous()
+             .view(batch_size, -1, self.n_heads * self.d_key)
+         )
+
+         # Pass through the final linear layer
+         output = self.Wo(A)
+
+         return output, attention_probs
+
+
+ # Position-Wise Feed Forward Network (FFN)
+ class PositionwiseFeedForward(nn.Module):
+     def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
+         """
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         d_ffn: dimensions of the feed-forward network
+         dropout: probability of dropout
+         """
+         super().__init__()
+
+         self.ffn = nn.Sequential(
+             nn.Linear(in_features=d_model, out_features=d_ffn),
+             nn.ReLU(),
+             nn.Linear(in_features=d_ffn, out_features=d_model),
+             nn.Dropout(p=dropout),
+         )
+
+     def forward(self, x):
+         return self.ffn(x)
+
+
+ # Encoder Layer
+ class EncoderLayer(nn.Module):
+     def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
+         """
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         n_heads: number of self attention heads per sequence
+         d_ffn: dimensions of the feed-forward network
+         dropout: probability of dropout
+         """
+         super().__init__()
+
+         # Multi-Head Self-Attention sublayer
+         self.attention = MultiHeadAttention(
+             d_model=d_model, n_heads=n_heads, dropout=dropout
+         )
+         self.attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+
+         # Position-wise Feed-forward Network
+         self.position_wise_ffn = PositionwiseFeedForward(
+             d_model=d_model, d_ffn=d_ffn, dropout=dropout
+         )
+         self.ffn_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+
+         self.dropout = nn.Dropout(p=dropout)
+
+     def forward(self, src: Tensor, src_mask: Tensor):
+         """
+         src: embedded sequences (batch_size, seq_length, d_model)
+         src_mask: mask for the sequences (batch_size, 1, 1, seq_length)
+         """
+         # Multi-Head Attention
+
+         # The source mask ensures the model ignores padding positions by assigning them near-zero attention scores.
+         _src, attention_probs = self.attention(src, src, src, src_mask)  # Q, K, V, mask
+
+         # Residual Addition and Layer Normalization
+         src = self.attention_layer_norm(
+             src + self.dropout(_src)
+         )  # Residual addition: add the src (the embeddings) back to the output of self-attention
+
+         # Position-wise Feed-forward Network
+         _src = self.position_wise_ffn(src)
+
+         # Residual Addition and Layer Normalization
+         src = self.ffn_layer_norm(src + self.dropout(_src))
+
+         return src, attention_probs
+
+
+ # The Encoder
+ class Encoder(nn.Module):
+     def __init__(
+         self,
+         d_model: int,
+         n_layers: int,
+         n_heads: int,
+         d_ffn: int,
+         dropout: float = 0.1,
+     ):
+         """
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         n_layers: number of encoder layers in the encoder block
+         n_heads: number of self attention heads per sequence
+         d_ffn: dimensions of the feed-forward network
+         dropout: probability of dropout
+         """
+         super().__init__()
+
+         # Create n_layers encoder layers
+         self.layers = nn.ModuleList(
+             [
+                 EncoderLayer(
+                     d_model=d_model, n_heads=n_heads, d_ffn=d_ffn, dropout=dropout
+                 )
+                 for layer in range(n_layers)
+             ]
+         )
+         self.dropout = nn.Dropout(p=dropout)
+
+     def forward(self, src: Tensor, src_mask: Tensor):
+         """
+         src: embedded sequences (batch_size, seq_length, d_model)
+         src_mask: mask for the sequences (batch_size, 1, 1, seq_length)
+         """
+
+         # Pass the sequences through each encoder layer
+         for layer in self.layers:
+             src, attention_probs = layer(src, src_mask)
+
+         self.attention_probs = attention_probs
+
+         return src
+
+
+ # Decoder Layer
+ class DecoderLayer(nn.Module):
+     def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
+         """
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         n_heads: number of self attention heads per sequence
+         d_ffn: dimensions of the feed-forward network
+         dropout: probability of dropout
+         """
+         super().__init__()
+
+         # Masked Multi-Head Self-Attention sublayer
+         self.masked_attention = MultiHeadAttention(
+             d_model=d_model, n_heads=n_heads, dropout=dropout
+         )
+         self.masked_attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+
+         # Multi-Head Cross-Attention sublayer
+         self.attention = MultiHeadAttention(
+             d_model=d_model, n_heads=n_heads, dropout=dropout
+         )
+         self.attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+
+         # Position-wise Feed-forward Network
+         self.position_wise_ffn = PositionwiseFeedForward(
+             d_model=d_model, d_ffn=d_ffn, dropout=dropout
+         )
+         self.ffn_layer_norm = nn.LayerNorm(d_model)  # Layer normalization
+
+         self.dropout = nn.Dropout(p=dropout)
+
+     def forward(self, trg: Tensor, src: Tensor, trg_mask: Tensor, src_mask: Tensor):
+         """
+         trg: embedded sequences (batch_size, trg_seq_length, d_model)
+         src: encoder output (batch_size, src_seq_length, d_model)
+         trg_mask: mask for the sequences (batch_size, 1, trg_seq_length, trg_seq_length)
+         src_mask: mask for the sequences (batch_size, 1, 1, src_seq_length)
+         """
+
+         # Masked Multi-Head Attention
+
+         # The target mask prevents the model from seeing future tokens, so each prediction is based only on past and present tokens.
+         _trg, masked_attention_probs = self.masked_attention(
+             trg, trg, trg, trg_mask
+         )  # Q, K, V, mask
+         # Residual Addition and Layer Normalization
+         trg = self.masked_attention_layer_norm(trg + self.dropout(_trg))
+
+         # Multi-Head Attention - this time, we also pass in the output of the encoder layers as src.
+         # This is important because it lets the decoder learn relationships between the input and output tokens (cross-attention).
+         _trg, attention_probs = self.attention(trg, src, src, src_mask)  # Q, K, V, mask
+         # Residual Addition and Layer Normalization
+         trg = self.attention_layer_norm(trg + self.dropout(_trg))
+
+         # Position-wise Feed-forward Network
+         _trg = self.position_wise_ffn(trg)
+         # Residual Addition and Layer Normalization
+         trg = self.ffn_layer_norm(trg + self.dropout(_trg))
+
+         return trg, attention_probs, masked_attention_probs
+
+
+ # The Decoder
+ class Decoder(nn.Module):
+     def __init__(
+         self,
+         vocab_size: int,
+         d_model: int,
+         n_layers: int,
+         n_heads: int,
+         d_ffn: int,
+         dropout: float = 0.1,
+     ):
+         """
+         vocab_size: size of the target vocabulary
+         d_model: dimensions of the embeddings (number of values in each embedding vector)
+         n_layers: number of decoder layers in the decoder block
+         n_heads: number of self attention heads per sequence
+         d_ffn: dimensions of the feed-forward network
+         dropout: probability of dropout
+         """
+         super().__init__()
+
+         # Create n_layers decoder layers
+         self.layers = nn.ModuleList(
+             [
+                 DecoderLayer(
+                     d_model=d_model, n_heads=n_heads, d_ffn=d_ffn, dropout=dropout
+                 )
+                 for layer in range(n_layers)
+             ]
+         )
+         self.dropout = nn.Dropout(p=dropout)
+
+         # Output layer
+         self.Wo = nn.Linear(in_features=d_model, out_features=vocab_size)
+
+     def forward(self, trg: Tensor, src: Tensor, trg_mask: Tensor, src_mask: Tensor):
+         """
+         trg: embedded sequences (batch_size, trg_seq_length, d_model)
+         src: encoder output (batch_size, src_seq_length, d_model)
+         trg_mask: mask for the sequences (batch_size, 1, trg_seq_length, trg_seq_length)
+         src_mask: mask for the sequences (batch_size, 1, 1, src_seq_length)
+         """
+
+         # Pass the sequences through each decoder layer
+         for layer in self.layers:
+             trg, attention_probs, masked_attention_probs = layer(
+                 trg, src, trg_mask, src_mask
+             )
+
+         self.attention_probs = attention_probs
+         self.masked_attention_probs = masked_attention_probs
+
+         return self.Wo(trg)
+
+
+ # The Transformer
+ class Transformer(nn.Module):
+     def __init__(
+         self,
+         encoder: Encoder,
+         decoder: Decoder,
+         src_embed: EmbeddingLayer,
+         trg_embed: EmbeddingLayer,
+         src_pad_idx: int,
+         trg_pad_idx: int,
+         device,
+     ):
+         """
+         encoder: encoder stack
+         decoder: decoder stack
+         src_embed: source embeddings
+         trg_embed: target embeddings
+         src_pad_idx: source padding index
+         trg_pad_idx: target padding index
+         device: device
+         """
+         super().__init__()
+
+         self.encoder = encoder
+         self.decoder = decoder
+         self.src_embed = src_embed
+         self.trg_embed = trg_embed
+         self.device = device
+         self.src_pad_idx = src_pad_idx
+         self.trg_pad_idx = trg_pad_idx
+
+     def make_src_mask(self, src: Tensor):
+         # Assign True to tokens that need to be attended to and False to padding tokens, then add 2 dimensions
+         src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
+
+         return src_mask
+
+     def make_trg_mask(self, trg: Tensor):
+         seq_length = trg.shape[1]
+
+         # Assign True to tokens that need to be attended to and False to padding tokens, then add 2 dimensions
+         trg_mask = (
+             (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
+         )  # (batch_size, 1, 1, seq_length)
+
+         # Generate the subsequent (causal) mask
+         trg_sub_mask = torch.tril(
+             torch.ones((seq_length, seq_length), device=self.device)
+         ).bool()  # (seq_length, seq_length)
+
+         # Bottom triangle is True, top triangle is False
+         trg_mask = trg_mask & trg_sub_mask
+
+         return trg_mask
+
+     def forward(self, src: Tensor, trg: Tensor):
+         """
+         src: raw src sequences (batch_size, src_seq_length)
+         trg: raw target sequences (batch_size, trg_seq_length)
+         """
+
+         # Create source and target masks
+         src_mask = self.make_src_mask(src)  # (batch_size, 1, 1, src_seq_length)
+
+         # The lower triangle of the mask is filled with 1s
+         trg_mask = self.make_trg_mask(
+             trg
+         )  # (batch_size, 1, trg_seq_length, trg_seq_length)
+
+         # Encoder layers
+         src = self.encoder(
+             self.src_embed(src), src_mask
+         )  # (batch_size, src_seq_length, d_model)
+
+         # Decoder layers
+         output = self.decoder(
+             self.trg_embed(trg), src, trg_mask, src_mask
+         )  # Pass in both the target (for Masked Multi-Head Self-Attention) and the encoder output (for Cross-Attention)
+
+         return output
+
+
+ def make_model(
+     device,
+     src_vocab,
+     trg_vocab,
+     n_layers: int = 3,
+     d_model: int = 512,
+     d_ffn: int = 2048,
+     n_heads: int = 8,
+     dropout: float = 0.1,
+     max_length: int = 5000,
+ ):
+     """
+     src_vocab: source vocabulary
+     trg_vocab: target vocabulary
+     n_layers: number of layers in the encoder and decoder blocks
+     d_model: dimensions of the embeddings (number of values in each embedding vector)
+     d_ffn: dimensions of the feed-forward network
+     n_heads: number of self attention heads per sequence
+     dropout: probability of dropout
+     max_length: maximum sequence length for positional encodings
+     """
+
+     encoder = Encoder(
+         d_model=d_model,
+         n_layers=n_layers,
+         n_heads=n_heads,
+         d_ffn=d_ffn,
+         dropout=dropout,
+     )
+
+     decoder = Decoder(
+         vocab_size=len(trg_vocab),
+         d_model=d_model,
+         n_layers=n_layers,
+         n_heads=n_heads,
+         d_ffn=d_ffn,
+         dropout=dropout,
+     )
+
+     src_embed = EmbeddingLayer(vocab_size=len(src_vocab), d_model=d_model)
+     trg_embed = EmbeddingLayer(vocab_size=len(trg_vocab), d_model=d_model)
+
+     pos_enc = PositionalEncoding(
+         d_model=d_model, dropout=dropout, max_length=max_length
+     )
+
+     model = Transformer(
+         encoder=encoder,
+         decoder=decoder,
+         src_embed=nn.Sequential(src_embed, pos_enc),
+         trg_embed=nn.Sequential(trg_embed, pos_enc),
+         src_pad_idx=src_vocab.get_stoi()["<pad>"],
+         trg_pad_idx=trg_vocab.get_stoi()["<pad>"],
+         device=device,
+     )
+
+     # Initialize parameters with Xavier/Glorot initialization
+     # This maintains a consistent variance of activations throughout the network
+     # and helps avoid issues like vanishing or exploding gradients.
+     for p in model.parameters():
+         if p.dim() > 1:
+             nn.init.xavier_uniform_(p)
+
+     return model
+
+
+ def translate_sentence(
+     sentence, model, vocab_src, vocab_trg, spacy_es, device, max_length=50
+ ):
+     model.eval()
+     if isinstance(sentence, str):
+         src = (
+             ["<bos>"] + [token.text.lower() for token in spacy_es(sentence)] + ["<eos>"]
+         )
+     else:
+         src = ["<bos>"] + sentence + ["<eos>"]
+     src_indexes = [vocab_src[token] for token in src]
+     src_tensor = torch.tensor(src_indexes).int().unsqueeze(0).to(device)
+     trg_indexes = [vocab_trg.stoi["<bos>"]]  # Start decoding from <bos>
+     for _ in range(max_length):  # Greedy decoding, one token at a time
+         trg_tensor = torch.tensor(trg_indexes).int().unsqueeze(0).to(device)
+         with torch.no_grad():
+             logits = model(src_tensor, trg_tensor)
+         pred_token = logits.argmax(dim=2)[:, -1].item()  # Most likely next token
+         if pred_token == vocab_trg.stoi["<eos>"]:
+             break
+         trg_indexes.append(pred_token)
+     trg_tokens = vocab_trg.lookup_tokens(trg_indexes)
+     return " ".join(trg_tokens)
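
Note: app.py likewise expects a translation_model.pt checkpoint. The sketch below, not part of this commit, shows one standard way such a checkpoint might be produced with teacher forcing and cross-entropy that ignores the padding index; the train_loader, epoch count, and optimizer settings are assumptions.

# Hypothetical training sketch for producing translation_model.pt.
# Assumes `train_loader` yields (src, trg) index tensors of shape (batch_size, seq_length).
import torch
import torch.nn as nn


def train(model, train_loader, vocab_trg, device, n_epochs=10, lr=5e-4):
    pad_idx = vocab_trg.get_stoi()["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)  # skip loss on padding
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(n_epochs):
        for src, trg in train_loader:
            src, trg = src.to(device), trg.to(device)
            # Teacher forcing: feed trg[:, :-1] and predict trg[:, 1:]
            logits = model(src, trg[:, :-1])  # (batch, trg_len - 1, vocab)
            loss = criterion(
                logits.reshape(-1, logits.size(-1)), trg[:, 1:].reshape(-1)
            )
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    torch.save(model.state_dict(), "translation_model.pt")  # the file app.py loads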
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ spacy
+ gradio
+ datasets
+ nltk
+ tqdm
+ matplotlib
+ numpy
translation_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5b13d937caa89b9320f6c0c57c0634c87f12ed8771868bad531dc8bf0bd60d5
+ size 646480754