In [21]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math
#import tensorflow as tf

In [22]:
with open("C:/Users/adity/Projects_of_Aditya/Working/India, officially the Republic of I.txt",'r',encoding='utf-8') as f:
    raw_text=f.read()
print(len(raw_text))
print(raw_text[:100])

55955
India, officially the Republic of India,[j][21] is a country in South Asia. It is the seventh-larges


In [23]:
train_ratio = 0.9
train_size = int(train_ratio * len(raw_text))
train_text = raw_text[:train_size]
val_text = raw_text[train_size:]

In [24]:
class BinarizeFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return torch.sign(input)
    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        mask=(input.abs()<=1).float()
        grad_input = grad_output * mask
        return grad_input

In [25]:
class QuantizedLinear(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(QuantizedLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)
    def forward(self, input):
        weight = BinarizeFunction.apply(self.weight)
        if self.bias is not None:
            return torch.nn.functional.linear(input, weight, self.bias)
        else:
            return torch.nn.functional.linear(input, weight)
    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )

In [26]:
from torch.utils.data import Dataset, DataLoader
import tiktoken

class GPTTokenizerDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []
        token_ids = self.tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):
    tokenizer = tiktoken.get_encoding("cl100k_base")
    dataset = GPTTokenizerDataset(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last
    )
    return dataloader

In [27]:
def generate_text(model,idx,max_new_tokens,context_size,temperature=0.4,top_k=3):
    for _ in range(max_new_tokens):
        idx_cond=idx[:,-context_size:]
        with torch.no_grad():
            logits=model(idx_cond)
        logits=logits[:,-1,:]
        if top_k is not None:
            top_logits,_=torch.topk(logits,top_k)
            min_val=top_logits[:,-1]
            logits=torch.where(logits<min_val,torch.tensor(float('-inf')).to(logits.device),logits)
        if temperature>0.0:
            logits=logits/temperature
            probs=torch.softmax(logits,dim=-1)
            idx_next=torch.multinomial(probs,num_samples=1)
        else:
            idx_next=torch.argmax(logits,dim=-1,keepdim=True)
        idx=torch.cat((idx,idx_next),dim=1)
    return idx

In [28]:
tokenizer = tiktoken.get_encoding("cl100k_base")
def text_to_token_ids(text,tokenizer):
    encoded=tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor=torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor
def token_ids_to_text(token_ids,tokenizer):
    flat=token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

Coding up the Attention model:- Here we would be creating a class of the causal attention and instantiating multiple times for the multihead attention model.

Now for example if we set the number of heads we want is 10, then what exactly happens:-
--> we obtain a tensor with ten sets of context vector matrices.
--> In each context vector matrix the rows represent the context vectors corresponding to the tokens, and the columns corresponding to the embedding dimension specified via d_out.
--> Final embedding dimension is 10 x 10.

IMPLEMENTING THE PARALLEL METHOD OF IMPLEMENTATION.

In [29]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = QuantizedLinear(d_in, d_out, bias=qkv_bias)
        self.W_key = QuantizedLinear(d_in, d_out, bias=qkv_bias)
        self.W_value = QuantizedLinear(d_in, d_out, bias=qkv_bias)
        self.out_proj = QuantizedLinear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )
    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)
        attn_scores = queries @ keys.transpose(2, 3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)
        return context_vec

In [30]:
config_tokenizer=tiktoken.get_encoding("cl100k_base")
actual_vocab_size=config_tokenizer.n_vocab
print("Vocab size:", actual_vocab_size)

Vocab size: 100277


In [31]:
#Defining the parameters
GPT_CONFIG={
    'vocab_size':actual_vocab_size,
    'context_length':256, # Change it to 1024 or greater if you have gpu
    'embedding_dim':512,
    'num_heads':16,
    'n_layers':12,
    'dropout':0.1,
    'qkv_bias':False #Whether to include a bias layer in the linear layers of the multi head attention for query,key and value computations.
}

Coding up the placeholder architecture, it is like the mothership from where all the robots will branch out

In [32]:
class GPT_Model(nn.Module):
    def __init__(self, cfg):
        #The __init__ constructor of this GPTModel class initializes the token and positional embedding layers using the configurations passed in via a Python dictionary, cfg.
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["embedding_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["embedding_dim"])
        self.drop_emb = nn.Dropout(cfg["dropout"])
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.final_norm = LayerNormalization(cfg["embedding_dim"])
        self.out_head = QuantizedLinear(cfg["embedding_dim"], cfg["vocab_size"], bias=False)
    def forward(self,in_idx):
        batch_size,seq_len=in_idx.shape
        in_idx = torch.clamp(in_idx, 0, self.tok_emb.num_embeddings - 1) #This was initially commented out
        token_embeddings=self.tok_emb(in_idx)
        positions = torch.arange(seq_len, device=in_idx.device).unsqueeze(0) #this is the extra added line
        positional_embeddings=self.pos_emb(positions)
        x=token_embeddings+positional_embeddings
        x=self.drop_emb(x)
        x=self.trf_blocks(x)
        x=self.final_norm(x)
        logits=self.out_head(x)
        return logits

In [33]:
class LayerNormalization(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean= x.mean(-1, keepdim=True)
        variance = x.var(-1, keepdim=True)
        norm_x=(x-mean)/(torch.sqrt(variance+self.eps))
        return self.scale*norm_x + self.shift

In [34]:
class TransformerBlock(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.att=MultiHeadAttention(
            d_in=config["embedding_dim"],
            d_out=config["embedding_dim"],
            context_length=config['context_length'],
            dropout=config['dropout'],
            num_heads=config['num_heads'],
            qkv_bias=config['qkv_bias']
        )
        self.ff=FeedForward(config)
        self.norm1=LayerNormalization(config["embedding_dim"])
        self.norm2=LayerNormalization(config["embedding_dim"])
        self.drop_resid=nn.Dropout(config['dropout'])
    def forward(self,x):
        shortcut=x
        x=self.norm1(x)
        x=self.att(x)
        x=self.drop_resid(x)
        x=x+shortcut
        shortcut=x
        x=self.norm2(x)
        x=self.ff(x)
        x=self.drop_resid(x)
        x=x+shortcut
        return x

We will use swish activation function.

In [35]:
class Swish(nn.Module):
    def __init__(self):
        super(Swish, self).__init__()
    def forward(self, x):
        return x * torch.sigmoid(x)

In [36]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers=nn.Sequential(
            nn.Linear(config["embedding_dim"], 4*config["embedding_dim"]),
            Swish(),
            nn.Linear(4*config["embedding_dim"], config["embedding_dim"]),
        )
    def forward(self, x):
        return self.layers(x)

In [37]:
class DeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes,use_shortcut):
        super().__init__()
        self.layers=nn.ModuleList([
            #We would be implementing 10 layers
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1])),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2])),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3])),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4])),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5])),
            nn.Sequential(nn.Linear(layer_sizes[5], layer_sizes[6])),
            nn.Sequential(nn.Linear(layer_sizes[6], layer_sizes[7])),
            nn.Sequential(nn.Linear(layer_sizes[7], layer_sizes[8])),
            nn.Sequential(nn.Linear(layer_sizes[8], layer_sizes[9])),
            nn.Sequential(nn.Linear(layer_sizes[9], layer_sizes[10])),
        ])
    def forward(self,x):
        for layer in self.layers:
            #Computing the output of the current layer
            layer_output=layer(x)
            #Check if shortcut can be applied
            if self.use_shortcut and x.shape==layer_output.shape:
                x=x+layer_output
            else:
                x=layer_output
            return x
def print_gradients(model,x):
    #First would be the forward pass
    output = model(x)
    target=torch.tensor([0,])
    #Loss calculation
    loss=nn.MSELoss()
    loss=loss(output,target)
    loss.backward()
    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} grad: {param.grad}")

Now let us initialise

In [38]:
batch_size = 2  # Number of samples in the batch
sequence_length = 10  # Length of each sequence
vocab_size = 100  # Size of the vocabulary
batch = torch.randint(0, vocab_size, (batch_size, sequence_length))
print(batch)

tensor([[36, 24, 61,  0, 41, 81, 18, 26, 93, 88],
        [26, 96, 17, 74, 20, 82, 52, 43, 96, 70]])


In [39]:
torch.manual_seed(123)
model=GPT_Model(GPT_CONFIG)
out=model(batch)
print("Input batch:\n",batch)
print("Output batch:\n",out.shape)
print(out)

Input batch:
 tensor([[36, 24, 61,  0, 41, 81, 18, 26, 93, 88],
        [26, 96, 17, 74, 20, 82, 52, 43, 96, 70]])
Output batch:
 torch.Size([2, 10, 100277])
tensor([[[ 1.6182e+01, -1.6015e+01, -9.4095e+00,  ...,  3.0794e-03,
           2.9054e+01,  1.6988e+01],
         [ 5.2240e+00,  2.7572e+01, -6.9735e+00,  ..., -8.0013e+00,
          -4.0101e-01,  2.8758e+01],
         [ 6.6475e+00, -1.1150e+01,  7.9781e+00,  ..., -2.5136e+01,
           7.3388e+00,  9.9231e+00],
         ...,
         [-4.3846e+00, -1.7154e+01,  1.0174e+01,  ..., -4.6591e+00,
          -8.3947e+00,  1.1043e+01],
         [ 3.5968e+01, -2.7967e+00, -2.8498e+01,  ..., -2.2024e+00,
          -1.1003e+01, -2.4883e-02],
         [ 1.9451e+01, -3.6966e+01,  7.5978e+00,  ...,  9.3602e+00,
           8.6090e+00, -2.6628e+00]],

        [[-2.8687e+01,  1.6627e+01, -1.4998e+01,  ..., -1.7184e+01,
           2.0726e+01,  8.0321e+00],
         [-4.0979e+01,  6.5536e-01,  4.1383e+00,  ..., -1.2853e+01,
          -1.7279e+01, 

Displaying the number of parameters for the GPT model

In [40]:
total_parameters=sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_parameters}")
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Total number of parameters: 140625920
Token embedding layer shape: torch.Size([100277, 512])
Output layer shape: torch.Size([100277, 512])


Number of trainable parameters in the model

In [41]:
total_params_gpt2 = total_parameters - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2}")

Number of trainable parameters considering weight tying: 89284096


In [42]:
total_size_in_bytes=total_parameters*4

total_size_of_the_model_in_MB=total_size_in_bytes/(1024*1024)
print(f"Total size of the model : {total_size_of_the_model_in_MB:.2f} MB")

Total size of the model : 536.45 MB


Total size of the model : 341.55 MB
Number of trainable parameters considering weight tying: 63935488


The next step is to now decode these tensors to proper text. Which would be coding up in the subsequent steps

In [43]:
#Let us try out the decoding procedure
start_context="Hello, I am Aditya."
tokenizer = tiktoken.get_encoding("cl100k_base")
encoded=tokenizer.encode(start_context)
print(encoded)

[9906, 11, 358, 1097, 2467, 488, 64, 13]


In [44]:
model.eval()

GPT_Model(
  (tok_emb): Embedding(100277, 512)
  (pos_emb): Embedding(256, 512)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)
        (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)
        (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)
        (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): Swish()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
      )
      (norm1): LayerNormalization()
      (norm2): LayerNormalization()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): Mul

In [45]:
model.eval()
out=generate_text(model=model,idx=torch.tensor(encoded).unsqueeze(0),max_new_tokens=6,context_size=GPT_CONFIG["context_length"])
print("Output:\n",out)

Output:
 tensor([[ 9906,    11,   358,  1097,  2467,   488,    64,    13, 48400, 85624,
          1993, 61732, 73414, 87133]])


In [46]:
start_context="Hello, I am Aditya I want to become a CEO one day of my own company"
token_ids=generate_text(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=10,context_size=GPT_CONFIG["context_length"])
print("Output text:\n",token_ids_to_text(token_ids,tokenizer))

Output text:
 Hello, I am Aditya I want to become a CEO one day of my own company steadily;/*	model collateral字符 Lois Middletonarios_DECL loophole


In [47]:
inputs=torch.tensor([[ 9906,    11,   358,  1097,  2467,   488,    64,    13, 41867, 40540,
         15145, 30876, 46468, 30001]])  # Remove extra comma and parenthesis to make it a tensor
with torch.no_grad():
  logits=model(inputs)
probas=torch.softmax(logits,dim=-1)
print(probas.shape)

torch.Size([1, 14, 100277])


In [48]:
torch.manual_seed(123)
train_loader=create_dataloader_v1(train_text,batch_size=4,max_length=GPT_CONFIG["context_length"],
                                  stride=GPT_CONFIG['context_length'],
                                  drop_last=True,
                                  shuffle=True
                                  )
val_loader=create_dataloader_v1(val_text,batch_size=4,max_length=GPT_CONFIG["context_length"],
                                  stride=GPT_CONFIG['context_length'],
                                  drop_last=True,
                                  shuffle=True
                                  )

In [49]:
print("Train loader:")
for x,y in train_loader:
    print(x.shape,y.shape)
print("\n Validation Loader:")
for x,y in val_loader:
    print(x.shape,y.shape)
# The output implies that the model has 18 training set batches with 2 samples and 256 tokens each

Train loader:
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])
torch.Size([4, 256]) torch.Size([4, 256])

 Validation Loader:
torch.Size([4, 256]) torch.Size([4, 256])


In [50]:
def calculation_of_loss(input_batch,target_batch,model,device):
  input_batch,target_batch=input_batch.to(device),target_batch.to(device)
  logits=model(input_batch)
  loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
  return loss

In [51]:
def loss_loader(data_loader, model, device, num_batches=4):
    total_loss = 0 
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calculation_of_loss(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [52]:
device='cpu'
model.to(device)
train_loss = loss_loader(train_loader, model, device='cpu',num_batches=4)
val_loss=loss_loader(val_loader,model,device='cpu',num_batches=4)
print(f"Train loss: {train_loss:.4f}")
print(f"Validation loss: {val_loss:.4f}")

Train loss: 98.4413
Validation loss: 24.3542


In [53]:
print(len(train_loader))
print(len(val_loader))

11
1


In [54]:
def train_the_model(model,train_loader,val_loader,epochs=1,learning_rate=3e-4):
    optimizer=torch.optim.AdamW(model.parameters(),lr=learning_rate)
    for epoch in range(epochs):
        model.train()
        for i,(input_batch,target_batch) in enumerate(train_loader):
            input_batch,target_batch=input_batch.to(device),target_batch.to(device)
            optimizer.zero_grad()
            logits=model(input_batch)
            loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
            loss.backward()
            optimizer.step()
            if i%100==0:
                print(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}")
        model.eval()
        train_loss = loss_loader(train_loader, model, device='cpu',num_batches=4)
        val_loss = loss_loader(val_loader, model, device='cpu',num_batches=4)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        return train_loss, val_loss

In [55]:
def evaluate_model(model,train_loader, val_loader, device='cpu', num_batches=4):
    model.eval()
    with torch.no_grad():
        train_loss = loss_loader(train_loader, model, device=device, num_batches=num_batches)
        val_loss = loss_loader(val_loader, model, device=device, num_batches=num_batches)
    model.train()
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    return train_loss, val_loss

In [56]:
torch.manual_seed(123)
model=GPT_Model(GPT_CONFIG)
model.to(device)
train_loss, val_loss = train_the_model(model, train_loader, val_loader, epochs=10, learning_rate=3e-4)

Epoch 1/10, Batch 0/11, Loss: 98.6930
Epoch 1/10, Train Loss: 94.4102, Validation Loss: 23.4683


In [57]:
while True:
    start_context=input()
    token_ids=generate_text(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=15,context_size=GPT_CONFIG["context_length"],temperature=0.4,top_k=3)
    print("Output text:\n",token_ids_to_text(token_ids,tokenizer))

Output text:
 Hi	raise pitched že beh Difference_rg Commons licens	sh taped LSUesco microseconds haberhandleRequest
Output text:
 Can you talk in english-authored Alert 값을 together Arlington Pert DatePicker CitProductName/mswonerrassouth995 considerably
Output text:
 Yup little bit less chinese	ll amongst Companies_Details_Details_Details_Details(diistribute sampano PUasingbowerazzo


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [61]:
optimizer=torch.optim.AdamW(model.parameters(),lr=3e-4)
torch.save({"model weights and biases":model.state_dict(),
            "optimizer_weights":optimizer.state_dict(),},
            "GPT_model.pth")

In [None]:
#Load the weights using the following code
#model = GPT_Model(GPT_CONFIG)
#model.load_state_dict(torch.load("GPT_model.pth"))
#model.eval()

GPT_Model(
  (tok_emb): Embedding(100277, 512)
  (pos_emb): Embedding(256, 512)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=512, out_features=512, bias=False)
        (W_key): Linear(in_features=512, out_features=512, bias=False)
        (W_value): Linear(in_features=512, out_features=512, bias=False)
        (out_proj): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): Swish()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
      )
      (norm1): LayerNormalization()
      (norm2): LayerNormalization()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): 