{ "cells": [ { "cell_type": "code", "execution_count": 21, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:19.511335Z", "start_time": "2025-04-20T10:30:14.130243Z" }, "collapsed": true, "id": "initial_id" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import torch\n", "import torch.nn as nn\n", "import math\n", "#import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 22, "id": "420a4dfdadcdee66", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:21.755678Z", "start_time": "2025-04-20T10:30:21.729677Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "420a4dfdadcdee66", "outputId": "a0132552-6de3-4c64-c3ab-73cdf858dbc0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "55955\n", "India, officially the Republic of India,[j][21] is a country in South Asia. It is the seventh-larges\n" ] } ], "source": [ "with open(\"C:/Users/adity/Projects_of_Aditya/Working/India, officially the Republic of I.txt\",'r',encoding='utf-8') as f:\n", " raw_text=f.read()\n", "print(len(raw_text))\n", "print(raw_text[:100])" ] }, { "cell_type": "code", "execution_count": 23, "id": "YJ4KwDtekrSy", "metadata": { "id": "YJ4KwDtekrSy" }, "outputs": [], "source": [ "train_ratio = 0.9\n", "train_size = int(train_ratio * len(raw_text))\n", "train_text = raw_text[:train_size]\n", "val_text = raw_text[train_size:]" ] }, { "cell_type": "code", "execution_count": 24, "id": "ebcdc51c", "metadata": {}, "outputs": [], "source": [ "class BinarizeFunction(torch.autograd.Function):\n", " @staticmethod\n", " def forward(ctx, input):\n", " ctx.save_for_backward(input)\n", " return torch.sign(input)\n", " @staticmethod\n", " def backward(ctx, grad_output):\n", " input, = ctx.saved_tensors\n", " mask=(input.abs()<=1).float()\n", " grad_input = grad_output * mask\n", " return grad_input" ] }, { "cell_type": "code", "execution_count": 25, "id": "6dd4cfd0", "metadata": {}, "outputs": [], "source": [ "class QuantizedLinear(nn.Module):\n", " def __init__(self, in_features, out_features, bias=True):\n", " super(QuantizedLinear, self).__init__()\n", " self.in_features = in_features\n", " self.out_features = out_features\n", " self.weight = nn.Parameter(torch.Tensor(out_features, in_features))\n", " if bias:\n", " self.bias = nn.Parameter(torch.Tensor(out_features))\n", " else:\n", " self.register_parameter('bias', None)\n", " self.reset_parameters()\n", "\n", " def reset_parameters(self):\n", " nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))\n", " if self.bias is not None:\n", " fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)\n", " bound = 1 / math.sqrt(fan_in)\n", " nn.init.uniform_(self.bias, -bound, bound)\n", " def forward(self, input):\n", " weight = BinarizeFunction.apply(self.weight)\n", " if self.bias is not None:\n", " return torch.nn.functional.linear(input, weight, self.bias)\n", " else:\n", " return torch.nn.functional.linear(input, weight)\n", " def extra_repr(self):\n", " return 'in_features={}, out_features={}, bias={}'.format(\n", " self.in_features, self.out_features, self.bias is not None\n", " )" ] }, { "cell_type": "code", "execution_count": 26, "id": "dd29070035dafb99", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:25.020010Z", "start_time": "2025-04-20T10:30:24.959908Z" }, "id": "dd29070035dafb99" }, "outputs": [], "source": [ "from torch.utils.data import Dataset, DataLoader\n", "import tiktoken\n", "\n", "class GPTTokenizerDataset(Dataset):\n", " def 
__init__(self, txt, tokenizer, max_length, stride):\n", " self.tokenizer = tokenizer\n", " self.input_ids = []\n", " self.target_ids = []\n", " token_ids = self.tokenizer.encode(txt)\n", "\n", " for i in range(0, len(token_ids) - max_length, stride):\n", " input_chunk = token_ids[i:i + max_length]\n", " target_chunk = token_ids[i + 1:i + max_length+1]\n", " self.input_ids.append(torch.tensor(input_chunk))\n", " self.target_ids.append(torch.tensor(target_chunk))\n", " def __len__(self):\n", " return len(self.input_ids)\n", " def __getitem__(self, idx):\n", " return self.input_ids[idx], self.target_ids[idx]\n", "def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):\n", " tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", " dataset = GPTTokenizerDataset(txt, tokenizer, max_length, stride)\n", " dataloader = DataLoader(\n", " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last\n", " )\n", " return dataloader" ] }, { "cell_type": "code", "execution_count": 27, "id": "40a9c2660445b78c", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:52.552634Z", "start_time": "2025-04-20T10:30:52.545337Z" }, "id": "40a9c2660445b78c" }, "outputs": [], "source": [ "def generate_text(model,idx,max_new_tokens,context_size,temperature=0.4,top_k=3):\n", " for _ in range(max_new_tokens):\n", " idx_cond=idx[:,-context_size:]\n", " with torch.no_grad():\n", " logits=model(idx_cond)\n", " logits=logits[:,-1,:]\n", " if top_k is not None:\n", " top_logits,_=torch.topk(logits,top_k)\n", " min_val=top_logits[:,-1]\n", " logits=torch.where(logits<min_val,torch.tensor(float('-inf')).to(logits.device),logits) # keep only the top_k logits, mask the rest to -inf\n", " if temperature>0.0:\n", " logits=logits/temperature\n", " probs=torch.softmax(logits,dim=-1)\n", " idx_next=torch.multinomial(probs,num_samples=1)\n", " else:\n", " idx_next=torch.argmax(logits,dim=-1,keepdim=True)\n", " idx=torch.cat((idx,idx_next),dim=1)\n", " return idx" ] }, { "cell_type": "code", "execution_count": 28, "id": "22a98021f476cc4d", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:56.399874Z", "start_time": "2025-04-20T10:30:55.660994Z" }, "id": "22a98021f476cc4d" }, "outputs": [], "source": [ "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", "def text_to_token_ids(text,tokenizer):\n", " encoded=tokenizer.encode(text,allowed_special={'<|endoftext|>'})\n", " encoded_tensor=torch.tensor(encoded).unsqueeze(0)\n", " return encoded_tensor\n", "def token_ids_to_text(token_ids,tokenizer):\n", " flat=token_ids.squeeze(0)\n", " return tokenizer.decode(flat.tolist())" ] }, { "cell_type": "markdown", "id": "c34f6594f2501fd3", "metadata": { "id": "c34f6594f2501fd3" }, "source": [ "Coding up the attention model: we build a causal self-attention mechanism and run several such heads together to obtain multi-head attention." ] }, { "cell_type": "markdown", "id": "779103be54de3305", "metadata": { "id": "779103be54de3305" }, "source": [ "For example, if we set the number of heads to 10, what exactly happens:\n", "--> We obtain a tensor with ten sets of context-vector matrices.\n", "--> In each context-vector matrix, the rows are the context vectors of the individual tokens and the columns correspond to the embedding dimension specified via d_out.\n", "--> Concatenating the heads gives a final embedding dimension of num_heads x d_out (10 x 10 = 100 in this example)." ] }, { "cell_type": "markdown", "id": "55a1ded1a5143e4b", "metadata": { "id": "55a1ded1a5143e4b" }, "source": [ "IMPLEMENTING MULTI-HEAD ATTENTION IN ITS PARALLEL (WEIGHT-SPLIT) FORM.\n", "A small shape demonstration follows before the full class."
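] }, { "cell_type": "markdown", "id": "mha_headsplit_demo_md", "metadata": {}, "source": [ "Before the full class, a small illustrative cell (toy sizes, random values, not used by the model) shows the reshape trick the parallel implementation relies on: one projection of shape (b, num_tokens, d_out) is viewed as (b, num_tokens, num_heads, head_dim) and transposed to (b, num_heads, num_tokens, head_dim), so the attention scores for all heads come out of a single batched matrix multiplication." ] }, { "cell_type": "code", "execution_count": null, "id": "mha_headsplit_demo", "metadata": {}, "outputs": [], "source": [ "# Toy illustration of the head-split used by MultiHeadAttention below (values are random)\n", "b, num_tokens, d_out, num_heads = 2, 4, 8, 2\n", "head_dim = d_out // num_heads\n", "toy_proj = torch.randn(b, num_tokens, d_out) # stand-in for W_query(x)\n", "heads = toy_proj.view(b, num_tokens, num_heads, head_dim).transpose(1, 2)\n", "print(heads.shape) # (b, num_heads, num_tokens, head_dim)\n", "scores = heads @ heads.transpose(2, 3)\n", "print(scores.shape) # (b, num_heads, num_tokens, num_tokens), one score matrix per head"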
] }, { "cell_type": "code", "execution_count": 29, "id": "9ffdb4830dd6536c", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:00.004231Z", "start_time": "2025-04-20T10:30:59.989116Z" }, "id": "9ffdb4830dd6536c" }, "outputs": [], "source": [ "class MultiHeadAttention(nn.Module):\n", " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n", " super().__init__()\n", " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n", " self.d_out = d_out\n", " self.num_heads = num_heads\n", " self.head_dim = d_out // num_heads\n", " self.W_query = QuantizedLinear(d_in, d_out, bias=qkv_bias)\n", " self.W_key = QuantizedLinear(d_in, d_out, bias=qkv_bias)\n", " self.W_value = QuantizedLinear(d_in, d_out, bias=qkv_bias)\n", " self.out_proj = QuantizedLinear(d_out, d_out)\n", " self.dropout = nn.Dropout(dropout)\n", " self.register_buffer(\n", " 'mask',\n", " torch.triu(torch.ones(context_length, context_length), diagonal=1)\n", " )\n", " def forward(self, x):\n", " b, num_tokens, d_in = x.shape\n", " keys = self.W_key(x)\n", " queries = self.W_query(x)\n", " values = self.W_value(x)\n", " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)\n", " values = values.view(b, num_tokens, self.num_heads, self.head_dim)\n", " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n", " keys = keys.transpose(1, 2)\n", " queries = queries.transpose(1, 2)\n", " values = values.transpose(1, 2)\n", " attn_scores = queries @ keys.transpose(2, 3)\n", " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", " attn_scores.masked_fill_(mask_bool, -torch.inf)\n", " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n", " attn_weights = self.dropout(attn_weights)\n", " context_vec = (attn_weights @ values).transpose(1, 2)\n", " context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)\n", " context_vec = self.out_proj(context_vec)\n", " return context_vec" ] }, { "cell_type": "code", "execution_count": 30, "id": "a361c4d3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vocab size: 100277\n" ] } ], "source": [ "config_tokenizer=tiktoken.get_encoding(\"cl100k_base\")\n", "actual_vocab_size=config_tokenizer.n_vocab\n", "print(\"Vocab size:\", actual_vocab_size)" ] }, { "cell_type": "code", "execution_count": 31, "id": "4f7ad555c6c06399", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:03.321536Z", "start_time": "2025-04-20T10:31:03.313914Z" }, "id": "4f7ad555c6c06399" }, "outputs": [], "source": [ "#Defining the parameters\n", "GPT_CONFIG={\n", " 'vocab_size':actual_vocab_size,\n", " 'context_length':256, # Change it to 1024 or greater if you have gpu\n", " 'embedding_dim':512,\n", " 'num_heads':16,\n", " 'n_layers':12,\n", " 'dropout':0.1,\n", " 'qkv_bias':False #Whether to include a bias layer in the linear layers of the multi head attention for query,key and value computations.\n", "}" ] }, { "cell_type": "markdown", "id": "47e51a02ecec92d5", "metadata": { "id": "47e51a02ecec92d5" }, "source": [ "Coding up the placeholder architecture, it is like the mothership from where all the robots will branch out" ] }, { "cell_type": "code", "execution_count": 32, "id": "4bb79e5ab1baf62a", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:06.415202Z", "start_time": "2025-04-20T10:31:06.403427Z" }, "id": "4bb79e5ab1baf62a" }, "outputs": [], "source": [ "class GPT_Model(nn.Module):\n", " def __init__(self, cfg):\n", " #The __init__ constructor of this 
GPTModel class initializes the token and positional embedding layers using the configurations passed in via a Python dictionary, cfg.\n", " super().__init__()\n", " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"embedding_dim\"])\n", " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"embedding_dim\"])\n", " self.drop_emb = nn.Dropout(cfg[\"dropout\"])\n", " self.trf_blocks = nn.Sequential(\n", " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])]\n", " )\n", " self.final_norm = LayerNormalization(cfg[\"embedding_dim\"])\n", " self.out_head = QuantizedLinear(cfg[\"embedding_dim\"], cfg[\"vocab_size\"], bias=False)\n", " def forward(self,in_idx):\n", " batch_size,seq_len=in_idx.shape\n", " in_idx = torch.clamp(in_idx, 0, self.tok_emb.num_embeddings - 1) #This was initially commented out\n", " token_embeddings=self.tok_emb(in_idx)\n", " positions = torch.arange(seq_len, device=in_idx.device).unsqueeze(0) #this is the extra added line\n", " positional_embeddings=self.pos_emb(positions)\n", " x=token_embeddings+positional_embeddings\n", " x=self.drop_emb(x)\n", " x=self.trf_blocks(x)\n", " x=self.final_norm(x)\n", " logits=self.out_head(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": 33, "id": "72748550", "metadata": {}, "outputs": [], "source": [ "class LayerNormalization(nn.Module):\n", " def __init__(self, emb_dim):\n", " super().__init__()\n", " self.eps = 1e-5\n", " self.scale = nn.Parameter(torch.ones(emb_dim))\n", " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", " def forward(self,x):\n", " mean= x.mean(-1, keepdim=True)\n", " variance = x.var(-1, keepdim=True)\n", " norm_x=(x-mean)/(torch.sqrt(variance+self.eps))\n", " return self.scale*norm_x + self.shift" ] }, { "cell_type": "code", "execution_count": 34, "id": "b81d6de9cdc325eb", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:09.094024Z", "start_time": "2025-04-20T10:31:09.082533Z" }, "id": "b81d6de9cdc325eb" }, "outputs": [], "source": [ "class TransformerBlock(nn.Module):\n", " def __init__(self,config):\n", " super().__init__()\n", " self.att=MultiHeadAttention(\n", " d_in=config[\"embedding_dim\"],\n", " d_out=config[\"embedding_dim\"],\n", " context_length=config['context_length'],\n", " dropout=config['dropout'],\n", " num_heads=config['num_heads'],\n", " qkv_bias=config['qkv_bias']\n", " )\n", " self.ff=FeedForward(config)\n", " self.norm1=LayerNormalization(config[\"embedding_dim\"])\n", " self.norm2=LayerNormalization(config[\"embedding_dim\"])\n", " self.drop_resid=nn.Dropout(config['dropout'])\n", " def forward(self,x):\n", " shortcut=x\n", " x=self.norm1(x)\n", " x=self.att(x)\n", " x=self.drop_resid(x)\n", " x=x+shortcut\n", " shortcut=x\n", " x=self.norm2(x)\n", " x=self.ff(x)\n", " x=self.drop_resid(x)\n", " x=x+shortcut\n", " return x" ] }, { "cell_type": "markdown", "id": "ee7086fdb0d258aa", "metadata": { "id": "ee7086fdb0d258aa" }, "source": [ "We will use swish activation function." 
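] }, { "cell_type": "markdown", "id": "swish_check_md", "metadata": {}, "source": [ "As a quick illustration (a small check cell using only torch built-ins), Swish with beta = 1 is x * sigmoid(x), which is the same function PyTorch exposes as torch.nn.functional.silu; the next cell wraps it in an nn.Module for use inside the feed-forward network." ] }, { "cell_type": "code", "execution_count": null, "id": "swish_check", "metadata": {}, "outputs": [], "source": [ "# Illustrative check: x * sigmoid(x) matches the built-in SiLU (Swish with beta = 1)\n", "xs = torch.linspace(-3, 3, 7)\n", "manual = xs * torch.sigmoid(xs)\n", "print(manual)\n", "print(torch.allclose(manual, torch.nn.functional.silu(xs)))"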
] }, { "cell_type": "code", "execution_count": 35, "id": "aafae17704f79949", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:14.198107Z", "start_time": "2025-04-20T10:31:14.183061Z" }, "id": "aafae17704f79949" }, "outputs": [], "source": [ "class Swish(nn.Module):\n", " def __init__(self):\n", " super(Swish, self).__init__()\n", " def forward(self, x):\n", " return x * torch.sigmoid(x)" ] }, { "cell_type": "code", "execution_count": 36, "id": "4b3a9eeaf0282a32", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:16.572707Z", "start_time": "2025-04-20T10:31:16.567278Z" }, "id": "4b3a9eeaf0282a32" }, "outputs": [], "source": [ "class FeedForward(nn.Module):\n", " def __init__(self, config):\n", " super().__init__()\n", " self.layers=nn.Sequential(\n", " nn.Linear(config[\"embedding_dim\"], 4*config[\"embedding_dim\"]),\n", " Swish(),\n", " nn.Linear(4*config[\"embedding_dim\"], config[\"embedding_dim\"]),\n", " )\n", " def forward(self, x):\n", " return self.layers(x)" ] }, { "cell_type": "code", "execution_count": 37, "id": "3888c877e7bb59fa", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:37.956131Z", "start_time": "2025-04-20T10:31:37.943199Z" }, "id": "3888c877e7bb59fa" }, "outputs": [], "source": [ "class DeepNeuralNetwork(nn.Module):\n", " def __init__(self, layer_sizes,use_shortcut):\n", " super().__init__()\n", " self.layers=nn.ModuleList([\n", " #We would be implementing 10 layers\n", " nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1])),\n", " nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2])),\n", " nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3])),\n", " nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4])),\n", " nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5])),\n", " nn.Sequential(nn.Linear(layer_sizes[5], layer_sizes[6])),\n", " nn.Sequential(nn.Linear(layer_sizes[6], layer_sizes[7])),\n", " nn.Sequential(nn.Linear(layer_sizes[7], layer_sizes[8])),\n", " nn.Sequential(nn.Linear(layer_sizes[8], layer_sizes[9])),\n", " nn.Sequential(nn.Linear(layer_sizes[9], layer_sizes[10])),\n", " ])\n", " def forward(self,x):\n", " for layer in self.layers:\n", " #Computing the output of the current layer\n", " layer_output=layer(x)\n", " #Check if shortcut can be applied\n", " if self.use_shortcut and x.shape==layer_output.shape:\n", " x=x+layer_output\n", " else:\n", " x=layer_output\n", " return x\n", "def print_gradients(model,x):\n", " #First would be the forward pass\n", " output = model(x)\n", " target=torch.tensor([0,])\n", " #Loss calculation\n", " loss=nn.MSELoss()\n", " loss=loss(output,target)\n", " loss.backward()\n", " for name, param in model.named_parameters():\n", " if 'weight' in name:\n", " print(f\"{name} grad: {param.grad}\")" ] }, { "cell_type": "markdown", "id": "78ab409a0177825", "metadata": { "id": "78ab409a0177825" }, "source": [ "Now let us initialise" ] }, { "cell_type": "code", "execution_count": 38, "id": "6710dda1f52d8b41", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:41.037621Z", "start_time": "2025-04-20T10:31:40.974254Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "6710dda1f52d8b41", "outputId": "c2753e89-89dc-4c5b-c086-53132aded738" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[36, 24, 61, 0, 41, 81, 18, 26, 93, 88],\n", " [26, 96, 17, 74, 20, 82, 52, 43, 96, 70]])\n" ] } ], "source": [ "batch_size = 2 # Number of samples in the batch\n", "sequence_length = 10 # Length of each sequence\n", "vocab_size = 100 # Size of the 
vocabulary\n", "batch = torch.randint(0, vocab_size, (batch_size, sequence_length))\n", "print(batch)" ] }, { "cell_type": "code", "execution_count": 39, "id": "b376992b9eb9a68c", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:44.349704Z", "start_time": "2025-04-20T10:31:43.391715Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "b376992b9eb9a68c", "outputId": "f67dc607-f218-4c20-848d-47212f38b749" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Input batch:\n", " tensor([[36, 24, 61, 0, 41, 81, 18, 26, 93, 88],\n", " [26, 96, 17, 74, 20, 82, 52, 43, 96, 70]])\n", "Output batch:\n", " torch.Size([2, 10, 100277])\n", "tensor([[[ 1.6182e+01, -1.6015e+01, -9.4095e+00, ..., 3.0794e-03,\n", " 2.9054e+01, 1.6988e+01],\n", " [ 5.2240e+00, 2.7572e+01, -6.9735e+00, ..., -8.0013e+00,\n", " -4.0101e-01, 2.8758e+01],\n", " [ 6.6475e+00, -1.1150e+01, 7.9781e+00, ..., -2.5136e+01,\n", " 7.3388e+00, 9.9231e+00],\n", " ...,\n", " [-4.3846e+00, -1.7154e+01, 1.0174e+01, ..., -4.6591e+00,\n", " -8.3947e+00, 1.1043e+01],\n", " [ 3.5968e+01, -2.7967e+00, -2.8498e+01, ..., -2.2024e+00,\n", " -1.1003e+01, -2.4883e-02],\n", " [ 1.9451e+01, -3.6966e+01, 7.5978e+00, ..., 9.3602e+00,\n", " 8.6090e+00, -2.6628e+00]],\n", "\n", " [[-2.8687e+01, 1.6627e+01, -1.4998e+01, ..., -1.7184e+01,\n", " 2.0726e+01, 8.0321e+00],\n", " [-4.0979e+01, 6.5536e-01, 4.1383e+00, ..., -1.2853e+01,\n", " -1.7279e+01, -1.3240e+01],\n", " [-1.9607e+01, 2.3471e+00, 7.2976e+00, ..., 4.8977e-01,\n", " -1.7134e+01, 3.4321e+00],\n", " ...,\n", " [-1.1025e+01, -2.4218e+00, 2.6663e+01, ..., 1.4770e+00,\n", " -4.0925e+01, 5.0661e-01],\n", " [-3.4426e+01, -2.2701e+00, 2.6099e+01, ..., -1.2846e+01,\n", " -2.4183e+01, -4.9127e+01],\n", " [ 1.6595e+00, -1.6062e+00, 1.8436e+01, ..., 3.3674e+01,\n", " -3.5222e+01, -2.4692e+01]]], grad_fn=)\n" ] } ], "source": [ "torch.manual_seed(123)\n", "model=GPT_Model(GPT_CONFIG)\n", "out=model(batch)\n", "print(\"Input batch:\\n\",batch)\n", "print(\"Output batch:\\n\",out.shape)\n", "print(out)" ] }, { "cell_type": "markdown", "id": "32204ab3e2917ca1", "metadata": { "id": "32204ab3e2917ca1" }, "source": [ "Displaying the number of parameters for the GPT model" ] }, { "cell_type": "code", "execution_count": 40, "id": "bfd0d944c222bfbf", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:49.707504Z", "start_time": "2025-04-20T10:31:49.699751Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "bfd0d944c222bfbf", "outputId": "bbad64e6-f379-475e-80d5-6d3fe5e79824" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total number of parameters: 140625920\n", "Token embedding layer shape: torch.Size([100277, 512])\n", "Output layer shape: torch.Size([100277, 512])\n" ] } ], "source": [ "total_parameters=sum(p.numel() for p in model.parameters())\n", "print(f\"Total number of parameters: {total_parameters}\")\n", "print(\"Token embedding layer shape:\", model.tok_emb.weight.shape)\n", "print(\"Output layer shape:\", model.out_head.weight.shape)" ] }, { "cell_type": "markdown", "id": "c2b39710a7897efb", "metadata": { "id": "c2b39710a7897efb" }, "source": [ "Number of trainable parameters in the model" ] }, { "cell_type": "code", "execution_count": 41, "id": "e047e3f5d5b4e540", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:53.034490Z", "start_time": "2025-04-20T10:31:53.027104Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "e047e3f5d5b4e540", "outputId": "b1793806-df53-4cf2-a09d-52e8485bb35f" }, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of trainable parameters considering weight tying: 89284096\n" ] } ], "source": [ "total_params_gpt2 = total_parameters - sum(p.numel() for p in model.out_head.parameters())\n", "print(f\"Number of trainable parameters considering weight tying: {total_params_gpt2}\")" ] }, { "cell_type": "code", "execution_count": 42, "id": "f611c62fb559142f", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:57.287950Z", "start_time": "2025-04-20T10:31:57.279346Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "f611c62fb559142f", "outputId": "24b7ef8b-df10-40a3-b192-46a8d32cf3e3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total size of the model : 536.45 MB\n" ] } ], "source": [ "total_size_in_bytes=total_parameters*4\n", "\n", "total_size_of_the_model_in_MB=total_size_in_bytes/(1024*1024)\n", "print(f\"Total size of the model : {total_size_of_the_model_in_MB:.2f} MB\")" ] }, { "cell_type": "markdown", "id": "645fa9c01a21b0e3", "metadata": { "id": "645fa9c01a21b0e3" }, "source": [ "Total size of the model : 341.55 MB\n", "Number of trainable parameters considering weight tying: 63935488\n" ] }, { "cell_type": "markdown", "id": "e32325eb6463fa21", "metadata": { "id": "e32325eb6463fa21" }, "source": [ "The next step is to now decode these tensors to proper text. Which would be coding up in the subsequent steps" ] }, { "cell_type": "code", "execution_count": 43, "id": "af8f873de4b1ea1f", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:36:18.521800Z", "start_time": "2025-04-20T10:36:18.507080Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "af8f873de4b1ea1f", "outputId": "8761b2e0-af06-4027-fc7b-b09c306d69cf" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[9906, 11, 358, 1097, 2467, 488, 64, 13]\n" ] } ], "source": [ "#Let us try out the decoding procedure\n", "start_context=\"Hello, I am Aditya.\"\n", "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", "encoded=tokenizer.encode(start_context)\n", "print(encoded)" ] }, { "cell_type": "code", "execution_count": 44, "id": "baf2d02c627a5911", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:32:31.432690Z", "start_time": "2025-04-20T10:32:31.416839Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "baf2d02c627a5911", "outputId": "b6a59155-048a-49e4-c1b5-683dbbad8f0a" }, "outputs": [ { "data": { "text/plain": [ "GPT_Model(\n", " (tok_emb): Embedding(100277, 512)\n", " (pos_emb): Embedding(256, 512)\n", " (drop_emb): Dropout(p=0.1, inplace=False)\n", " (trf_blocks): Sequential(\n", " (0): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, 
out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (3): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (4): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (5): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, 
out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (6): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (7): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (8): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (9): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (10): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): 
QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (11): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (final_norm): LayerNormalization()\n", " (out_head): QuantizedLinear(in_features=512, out_features=100277, bias=False)\n", ")" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.eval()" ] }, { "cell_type": "code", "execution_count": 45, "id": "8e6a5e5afc3272d6", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:36:21.766425Z", "start_time": "2025-04-20T10:36:21.340642Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "8e6a5e5afc3272d6", "outputId": "4b2dcdff-161f-47c8-cca4-e84a9e117e2f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output:\n", " tensor([[ 9906, 11, 358, 1097, 2467, 488, 64, 13, 48400, 85624,\n", " 1993, 61732, 73414, 87133]])\n" ] } ], "source": [ "model.eval()\n", "out=generate_text(model=model,idx=torch.tensor(encoded).unsqueeze(0),max_new_tokens=6,context_size=GPT_CONFIG[\"context_length\"])\n", "print(\"Output:\\n\",out)" ] }, { "cell_type": "code", "execution_count": 46, "id": "1ffca81eb2e208dd", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:36:31.970156Z", "start_time": "2025-04-20T10:36:30.980631Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "1ffca81eb2e208dd", "outputId": "5d1b6fe6-0368-46c9-ead1-7cc1a3174322" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output text:\n", " Hello, I am Aditya I want to become a CEO one day of my own company steadily;/*\tmodel collateral字符 Lois Middletonarios_DECL loophole\n" ] } ], "source": [ "start_context=\"Hello, I am Aditya I want to become a CEO one day of my own company\"\n", "token_ids=generate_text(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=10,context_size=GPT_CONFIG[\"context_length\"])\n", "print(\"Output text:\\n\",token_ids_to_text(token_ids,tokenizer))" ] }, { "cell_type": "code", "execution_count": 47, "id": "yxZH4QzR-ydZ", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yxZH4QzR-ydZ", "outputId": "d46883fa-15f6-44e9-d69f-797a3af7a8c4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"torch.Size([1, 14, 100277])\n" ] } ], "source": [ "inputs=torch.tensor([[ 9906, 11, 358, 1097, 2467, 488, 64, 13, 41867, 40540,\n", " 15145, 30876, 46468, 30001]]) # Remove extra comma and parenthesis to make it a tensor\n", "with torch.no_grad():\n", " logits=model(inputs)\n", "probas=torch.softmax(logits,dim=-1)\n", "print(probas.shape)" ] }, { "cell_type": "code", "execution_count": 48, "id": "MTItfymWGhRZ", "metadata": { "id": "MTItfymWGhRZ" }, "outputs": [], "source": [ "torch.manual_seed(123)\n", "train_loader=create_dataloader_v1(train_text,batch_size=4,max_length=GPT_CONFIG[\"context_length\"],\n", " stride=GPT_CONFIG['context_length'],\n", " drop_last=True,\n", " shuffle=True\n", " )\n", "val_loader=create_dataloader_v1(val_text,batch_size=4,max_length=GPT_CONFIG[\"context_length\"],\n", " stride=GPT_CONFIG['context_length'],\n", " drop_last=True,\n", " shuffle=True\n", " )" ] }, { "cell_type": "code", "execution_count": 49, "id": "e853b287", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train loader:\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "\n", " Validation Loader:\n", "torch.Size([4, 256]) torch.Size([4, 256])\n" ] } ], "source": [ "print(\"Train loader:\")\n", "for x,y in train_loader:\n", " print(x.shape,y.shape)\n", "print(\"\\n Validation Loader:\")\n", "for x,y in val_loader:\n", " print(x.shape,y.shape)\n", "# The output implies that the model has 18 training set batches with 2 samples and 256 tokens each" ] }, { "cell_type": "code", "execution_count": 50, "id": "Df2uwuFnmOp3", "metadata": { "id": "Df2uwuFnmOp3" }, "outputs": [], "source": [ "def calculation_of_loss(input_batch,target_batch,model,device):\n", " input_batch,target_batch=input_batch.to(device),target_batch.to(device)\n", " logits=model(input_batch)\n", " loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())\n", " return loss" ] }, { "cell_type": "code", "execution_count": 51, "id": "hdoiK6MLcrYV", "metadata": { "id": "hdoiK6MLcrYV" }, "outputs": [], "source": [ "def loss_loader(data_loader, model, device, num_batches=4):\n", " total_loss = 0 \n", " for i, (input_batch, target_batch) in enumerate(data_loader):\n", " if i < num_batches:\n", " loss = calculation_of_loss(input_batch, target_batch, model, device)\n", " total_loss += loss.item()\n", " else:\n", " break\n", " return total_loss / num_batches" ] }, { "cell_type": "code", "execution_count": 52, "id": "x89QUR65ePEs", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 383 }, "id": "x89QUR65ePEs", "outputId": "7b4bc307-b3fb-45b7-d067-724481f7bbce" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train loss: 98.4413\n", "Validation loss: 24.3542\n" ] } ], "source": [ "device='cpu'\n", "model.to(device)\n", "train_loss = loss_loader(train_loader, model, device='cpu',num_batches=4)\n", "val_loss=loss_loader(val_loader,model,device='cpu',num_batches=4)\n", "print(f\"Train loss: {train_loss:.4f}\")\n", "print(f\"Validation loss: {val_loss:.4f}\")" ] }, { "cell_type": 
"code", "execution_count": 53, "id": "4aa447fc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11\n", "1\n" ] } ], "source": [ "print(len(train_loader))\n", "print(len(val_loader))" ] }, { "cell_type": "code", "execution_count": 54, "id": "a0020a0e", "metadata": {}, "outputs": [], "source": [ "def train_the_model(model,train_loader,val_loader,epochs=1,learning_rate=3e-4):\n", " optimizer=torch.optim.AdamW(model.parameters(),lr=learning_rate)\n", " for epoch in range(epochs):\n", " model.train()\n", " for i,(input_batch,target_batch) in enumerate(train_loader):\n", " input_batch,target_batch=input_batch.to(device),target_batch.to(device)\n", " optimizer.zero_grad()\n", " logits=model(input_batch)\n", " loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())\n", " loss.backward()\n", " optimizer.step()\n", " if i%100==0:\n", " print(f\"Epoch {epoch+1}/{epochs}, Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}\")\n", " model.eval()\n", " train_loss = loss_loader(train_loader, model, device='cpu',num_batches=4)\n", " val_loss = loss_loader(val_loader, model, device='cpu',num_batches=4)\n", " print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}\")\n", " return train_loss, val_loss" ] }, { "cell_type": "code", "execution_count": 55, "id": "b8407429", "metadata": {}, "outputs": [], "source": [ "def evaluate_model(model,train_loader, val_loader, device='cpu', num_batches=4):\n", " model.eval()\n", " with torch.no_grad():\n", " train_loss = loss_loader(train_loader, model, device=device, num_batches=num_batches)\n", " val_loss = loss_loader(val_loader, model, device=device, num_batches=num_batches)\n", " model.train()\n", " print(f\"Train Loss: {train_loss:.4f}\")\n", " print(f\"Validation Loss: {val_loss:.4f}\")\n", " return train_loss, val_loss" ] }, { "cell_type": "code", "execution_count": 56, "id": "96d3965f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10, Batch 0/11, Loss: 98.6930\n", "Epoch 1/10, Train Loss: 94.4102, Validation Loss: 23.4683\n" ] } ], "source": [ "torch.manual_seed(123)\n", "model=GPT_Model(GPT_CONFIG)\n", "model.to(device)\n", "train_loss, val_loss = train_the_model(model, train_loader, val_loader, epochs=10, learning_rate=3e-4)" ] }, { "cell_type": "code", "execution_count": 57, "id": "fac91e1d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output text:\n", " Hi\traise pitched že beh Difference_rg Commons licens\tsh taped LSUesco microseconds haberhandleRequest\n", "Output text:\n", " Can you talk in english-authored Alert 값을 together Arlington Pert DatePicker CitProductName/mswonerrassouth995 considerably\n", "Output text:\n", " Yup little bit less chinese\tll amongst Companies_Details_Details_Details_Details(diistribute sampano PUasingbowerazzo\n" ] }, { "ename": "RuntimeError", "evalue": "Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[57]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m 
\u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m 2\u001b[39m start_context=\u001b[38;5;28minput\u001b[39m()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m token_ids=\u001b[43mgenerate_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43midx\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext_to_token_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstart_context\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m15\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mcontext_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mGPT_CONFIG\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontext_length\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtemperature\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mtop_k\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m3\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mOutput text:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m,token_ids_to_text(token_ids,tokenizer))\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36mgenerate_text\u001b[39m\u001b[34m(model, idx, max_new_tokens, context_size, temperature, top_k)\u001b[39m\n\u001b[32m 3\u001b[39m idx_cond=idx[:,-context_size:]\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m logits=\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[43midx_cond\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6\u001b[39m logits=logits[:,-\u001b[32m1\u001b[39m,:]\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m top_k \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1751\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1749\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1750\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1751\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1762\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1757\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1758\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1759\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1760\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1761\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1762\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1764\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1765\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[32]\u001b[39m\u001b[32m, line 16\u001b[39m, in \u001b[36mGPT_Model.forward\u001b[39m\u001b[34m(self, in_idx)\u001b[39m\n\u001b[32m 14\u001b[39m batch_size,seq_len=in_idx.shape\n\u001b[32m 15\u001b[39m in_idx = torch.clamp(in_idx, \u001b[32m0\u001b[39m, \u001b[38;5;28mself\u001b[39m.tok_emb.num_embeddings - \u001b[32m1\u001b[39m) \u001b[38;5;66;03m#This was initially commented out\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m token_embeddings=\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtok_emb\u001b[49m\u001b[43m(\u001b[49m\u001b[43min_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 17\u001b[39m positions = torch.arange(seq_len, device=in_idx.device).unsqueeze(\u001b[32m0\u001b[39m) \u001b[38;5;66;03m#this is the extra added line\u001b[39;00m\n\u001b[32m 18\u001b[39m positional_embeddings=\u001b[38;5;28mself\u001b[39m.pos_emb(positions)\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1751\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1749\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1750\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1751\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1762\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1757\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1758\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1759\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks 
\u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1760\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1761\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1762\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1764\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1765\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\sparse.py:190\u001b[39m, in \u001b[36mEmbedding.forward\u001b[39m\u001b[34m(self, input)\u001b[39m\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) -> Tensor:\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[43m.\u001b[49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 192\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 193\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 194\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 195\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 196\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 197\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msparse\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 198\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\functional.py:2551\u001b[39m, in \u001b[36membedding\u001b[39m\u001b[34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[39m\n\u001b[32m 2545\u001b[39m \u001b[38;5;66;03m# Note [embedding_renorm set_grad_enabled]\u001b[39;00m\n\u001b[32m 2546\u001b[39m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[32m 2547\u001b[39m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[32m 2548\u001b[39m \u001b[38;5;66;03m# torch.embedding_renorm_\u001b[39;00m\n\u001b[32m 2549\u001b[39m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[32m 2550\u001b[39m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[32m-> \u001b[39m\u001b[32m2551\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[31mRuntimeError\u001b[39m: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)" ] } ], "source": [ "while True:\n", " start_context=input()\n", " token_ids=generate_text(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=15,context_size=GPT_CONFIG[\"context_length\"],temperature=0.4,top_k=3)\n", " print(\"Output text:\\n\",token_ids_to_text(token_ids,tokenizer))" ] }, { "cell_type": "code", "execution_count": 61, "id": "19ea61ce", "metadata": {}, "outputs": [], "source": [ "optimizer=torch.optim.AdamW(model.parameters(),lr=3e-4)\n", "torch.save({\"model weights and biases\":model.state_dict(),\n", " \"optimizer_weights\":optimizer.state_dict(),},\n", " \"GPT_model.pth\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d88da3c8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GPT_Model(\n", " (tok_emb): Embedding(100277, 512)\n", " (pos_emb): Embedding(256, 512)\n", " (drop_emb): Dropout(p=0.1, inplace=False)\n", " (trf_blocks): Sequential(\n", " (0): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): 
Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (3): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (4): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (5): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (6): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (7): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " 
(out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (8): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (9): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (10): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (11): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (final_norm): LayerNormalization()\n", " (out_head): Linear(in_features=512, 
out_features=100277, bias=False)\n", ")" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Load the weights using the following code\n", "#checkpoint = torch.load(\"GPT_model.pth\")\n", "#model = GPT_Model(GPT_CONFIG)\n", "#model.load_state_dict(checkpoint[\"model weights and biases\"])\n", "#optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)\n", "#optimizer.load_state_dict(checkpoint[\"optimizer_weights\"])\n", "#model.eval()" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 5 }