{ "cells": [ { "cell_type": "code", "execution_count": 21, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:19.511335Z", "start_time": "2025-04-20T10:30:14.130243Z" }, "collapsed": true, "id": "initial_id" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import torch\n", "import torch.nn as nn\n", "import math\n", "#import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 22, "id": "420a4dfdadcdee66", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:21.755678Z", "start_time": "2025-04-20T10:30:21.729677Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "420a4dfdadcdee66", "outputId": "a0132552-6de3-4c64-c3ab-73cdf858dbc0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "55955\n", "India, officially the Republic of India,[j][21] is a country in South Asia. It is the seventh-larges\n" ] } ], "source": [ "with open(\"C:/Users/adity/Projects_of_Aditya/Working/India, officially the Republic of I.txt\",'r',encoding='utf-8') as f:\n", " raw_text=f.read()\n", "print(len(raw_text))\n", "print(raw_text[:100])" ] }, { "cell_type": "code", "execution_count": 23, "id": "YJ4KwDtekrSy", "metadata": { "id": "YJ4KwDtekrSy" }, "outputs": [], "source": [ "train_ratio = 0.9\n", "train_size = int(train_ratio * len(raw_text))\n", "train_text = raw_text[:train_size]\n", "val_text = raw_text[train_size:]" ] }, { "cell_type": "code", "execution_count": 24, "id": "ebcdc51c", "metadata": {}, "outputs": [], "source": [ "class BinarizeFunction(torch.autograd.Function):\n", " @staticmethod\n", " def forward(ctx, input):\n", " ctx.save_for_backward(input)\n", " return torch.sign(input)\n", " @staticmethod\n", " def backward(ctx, grad_output):\n", " input, = ctx.saved_tensors\n", " mask=(input.abs()<=1).float()\n", " grad_input = grad_output * mask\n", " return grad_input" ] }, { "cell_type": "code", "execution_count": 25, "id": "6dd4cfd0", "metadata": {}, "outputs": [], "source": [ "class QuantizedLinear(nn.Module):\n", " def __init__(self, in_features, out_features, bias=True):\n", " super(QuantizedLinear, self).__init__()\n", " self.in_features = in_features\n", " self.out_features = out_features\n", " self.weight = nn.Parameter(torch.Tensor(out_features, in_features))\n", " if bias:\n", " self.bias = nn.Parameter(torch.Tensor(out_features))\n", " else:\n", " self.register_parameter('bias', None)\n", " self.reset_parameters()\n", "\n", " def reset_parameters(self):\n", " nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))\n", " if self.bias is not None:\n", " fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)\n", " bound = 1 / math.sqrt(fan_in)\n", " nn.init.uniform_(self.bias, -bound, bound)\n", " def forward(self, input):\n", " weight = BinarizeFunction.apply(self.weight)\n", " if self.bias is not None:\n", " return torch.nn.functional.linear(input, weight, self.bias)\n", " else:\n", " return torch.nn.functional.linear(input, weight)\n", " def extra_repr(self):\n", " return 'in_features={}, out_features={}, bias={}'.format(\n", " self.in_features, self.out_features, self.bias is not None\n", " )" ] }, { "cell_type": "code", "execution_count": 26, "id": "dd29070035dafb99", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:25.020010Z", "start_time": "2025-04-20T10:30:24.959908Z" }, "id": "dd29070035dafb99" }, "outputs": [], "source": [ "from torch.utils.data import Dataset, DataLoader\n", "import tiktoken\n", "\n", "class GPTTokenizerDataset(Dataset):\n", " def 
__init__(self, txt, tokenizer, max_length, stride):\n", " self.tokenizer = tokenizer\n", " self.input_ids = []\n", " self.target_ids = []\n", " token_ids = self.tokenizer.encode(txt)\n", "\n", " for i in range(0, len(token_ids) - max_length, stride):\n", " input_chunk = token_ids[i:i + max_length]\n", " target_chunk = token_ids[i + 1:i + max_length+1]\n", " self.input_ids.append(torch.tensor(input_chunk))\n", " self.target_ids.append(torch.tensor(target_chunk))\n", " def __len__(self):\n", " return len(self.input_ids)\n", " def __getitem__(self, idx):\n", " return self.input_ids[idx], self.target_ids[idx]\n", "def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):\n", " tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", " dataset = GPTTokenizerDataset(txt, tokenizer, max_length, stride)\n", " dataloader = DataLoader(\n", " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last\n", " )\n", " return dataloader" ] }, { "cell_type": "code", "execution_count": 27, "id": "40a9c2660445b78c", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:52.552634Z", "start_time": "2025-04-20T10:30:52.545337Z" }, "id": "40a9c2660445b78c" }, "outputs": [], "source": [ "def generate_text(model,idx,max_new_tokens,context_size,temperature=0.4,top_k=3):\n", " for _ in range(max_new_tokens):\n", " idx_cond=idx[:,-context_size:]\n", " with torch.no_grad():\n", " logits=model(idx_cond)\n", " logits=logits[:,-1,:]\n", " if top_k is not None:\n", " top_logits,_=torch.topk(logits,top_k)\n", " min_val=top_logits[:,-1]\n", " logits=torch.where(logits<min_val,torch.tensor(float('-inf')).to(logits.device),logits) # keep only the top_k logits, mask the rest to -inf\n", " if temperature>0.0:\n", " logits=logits/temperature\n", " probs=torch.softmax(logits,dim=-1)\n", " idx_next=torch.multinomial(probs,num_samples=1)\n", " else:\n", " idx_next=torch.argmax(logits,dim=-1,keepdim=True)\n", " idx=torch.cat((idx,idx_next),dim=1)\n", " return idx" ] }, { "cell_type": "code", "execution_count": 28, "id": "22a98021f476cc4d", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:30:56.399874Z", "start_time": "2025-04-20T10:30:55.660994Z" }, "id": "22a98021f476cc4d" }, "outputs": [], "source": [ "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", "def text_to_token_ids(text,tokenizer):\n", " encoded=tokenizer.encode(text,allowed_special={'<|endoftext|>'})\n", " encoded_tensor=torch.tensor(encoded).unsqueeze(0)\n", " return encoded_tensor\n", "def token_ids_to_text(token_ids,tokenizer):\n", " flat=token_ids.squeeze(0)\n", " return tokenizer.decode(flat.tolist())" ] }, { "cell_type": "markdown", "id": "c34f6594f2501fd3", "metadata": { "id": "c34f6594f2501fd3" }, "source": [ "Coding up the attention model: we build a causal self-attention mechanism and run several such heads together to obtain multi-head attention." ] }, { "cell_type": "markdown", "id": "779103be54de3305", "metadata": { "id": "779103be54de3305" }, "source": [ "For example, if we set the number of heads to 10, what exactly happens:\n", "--> We obtain a tensor with ten sets of context-vector matrices.\n", "--> In each context-vector matrix, the rows are the context vectors of the individual tokens and the columns correspond to the embedding dimension specified via d_out.\n", "--> Concatenating the heads gives a final embedding dimension of num_heads x d_out (10 x 10 = 100 in this example)." ] }, { "cell_type": "markdown", "id": "55a1ded1a5143e4b", "metadata": { "id": "55a1ded1a5143e4b" }, "source": [ "IMPLEMENTING MULTI-HEAD ATTENTION IN ITS PARALLEL (WEIGHT-SPLIT) FORM.\n", "A small shape demonstration follows before the full class."
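] }, { "cell_type": "markdown", "id": "mha_headsplit_demo_md", "metadata": {}, "source": [ "Before the full class, a small illustrative cell (toy sizes, random values, not used by the model) shows the reshape trick the parallel implementation relies on: one projection of shape (b, num_tokens, d_out) is viewed as (b, num_tokens, num_heads, head_dim) and transposed to (b, num_heads, num_tokens, head_dim), so the attention scores for all heads come out of a single batched matrix multiplication." ] }, { "cell_type": "code", "execution_count": null, "id": "mha_headsplit_demo", "metadata": {}, "outputs": [], "source": [ "# Toy illustration of the head-split used by MultiHeadAttention below (values are random)\n", "b, num_tokens, d_out, num_heads = 2, 4, 8, 2\n", "head_dim = d_out // num_heads\n", "toy_proj = torch.randn(b, num_tokens, d_out) # stand-in for W_query(x)\n", "heads = toy_proj.view(b, num_tokens, num_heads, head_dim).transpose(1, 2)\n", "print(heads.shape) # (b, num_heads, num_tokens, head_dim)\n", "scores = heads @ heads.transpose(2, 3)\n", "print(scores.shape) # (b, num_heads, num_tokens, num_tokens), one score matrix per head"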
] }, { "cell_type": "code", "execution_count": 29, "id": "9ffdb4830dd6536c", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:00.004231Z", "start_time": "2025-04-20T10:30:59.989116Z" }, "id": "9ffdb4830dd6536c" }, "outputs": [], "source": [ "class MultiHeadAttention(nn.Module):\n", " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n", " super().__init__()\n", " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n", " self.d_out = d_out\n", " self.num_heads = num_heads\n", " self.head_dim = d_out // num_heads\n", " self.W_query = QuantizedLinear(d_in, d_out, bias=qkv_bias)\n", " self.W_key = QuantizedLinear(d_in, d_out, bias=qkv_bias)\n", " self.W_value = QuantizedLinear(d_in, d_out, bias=qkv_bias)\n", " self.out_proj = QuantizedLinear(d_out, d_out)\n", " self.dropout = nn.Dropout(dropout)\n", " self.register_buffer(\n", " 'mask',\n", " torch.triu(torch.ones(context_length, context_length), diagonal=1)\n", " )\n", " def forward(self, x):\n", " b, num_tokens, d_in = x.shape\n", " keys = self.W_key(x)\n", " queries = self.W_query(x)\n", " values = self.W_value(x)\n", " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)\n", " values = values.view(b, num_tokens, self.num_heads, self.head_dim)\n", " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n", " keys = keys.transpose(1, 2)\n", " queries = queries.transpose(1, 2)\n", " values = values.transpose(1, 2)\n", " attn_scores = queries @ keys.transpose(2, 3)\n", " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", " attn_scores.masked_fill_(mask_bool, -torch.inf)\n", " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n", " attn_weights = self.dropout(attn_weights)\n", " context_vec = (attn_weights @ values).transpose(1, 2)\n", " context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)\n", " context_vec = self.out_proj(context_vec)\n", " return context_vec" ] }, { "cell_type": "code", "execution_count": 30, "id": "a361c4d3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vocab size: 100277\n" ] } ], "source": [ "config_tokenizer=tiktoken.get_encoding(\"cl100k_base\")\n", "actual_vocab_size=config_tokenizer.n_vocab\n", "print(\"Vocab size:\", actual_vocab_size)" ] }, { "cell_type": "code", "execution_count": 31, "id": "4f7ad555c6c06399", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:03.321536Z", "start_time": "2025-04-20T10:31:03.313914Z" }, "id": "4f7ad555c6c06399" }, "outputs": [], "source": [ "#Defining the parameters\n", "GPT_CONFIG={\n", " 'vocab_size':actual_vocab_size,\n", " 'context_length':256, # Change it to 1024 or greater if you have gpu\n", " 'embedding_dim':512,\n", " 'num_heads':16,\n", " 'n_layers':12,\n", " 'dropout':0.1,\n", " 'qkv_bias':False #Whether to include a bias layer in the linear layers of the multi head attention for query,key and value computations.\n", "}" ] }, { "cell_type": "markdown", "id": "47e51a02ecec92d5", "metadata": { "id": "47e51a02ecec92d5" }, "source": [ "Coding up the placeholder architecture, it is like the mothership from where all the robots will branch out" ] }, { "cell_type": "code", "execution_count": 32, "id": "4bb79e5ab1baf62a", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:06.415202Z", "start_time": "2025-04-20T10:31:06.403427Z" }, "id": "4bb79e5ab1baf62a" }, "outputs": [], "source": [ "class GPT_Model(nn.Module):\n", " def __init__(self, cfg):\n", " #The __init__ constructor of this 
GPTModel class initializes the token and positional embedding layers using the configurations passed in via a Python dictionary, cfg.\n", " super().__init__()\n", " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"embedding_dim\"])\n", " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"embedding_dim\"])\n", " self.drop_emb = nn.Dropout(cfg[\"dropout\"])\n", " self.trf_blocks = nn.Sequential(\n", " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])]\n", " )\n", " self.final_norm = LayerNormalization(cfg[\"embedding_dim\"])\n", " self.out_head = QuantizedLinear(cfg[\"embedding_dim\"], cfg[\"vocab_size\"], bias=False)\n", " def forward(self,in_idx):\n", " batch_size,seq_len=in_idx.shape\n", " in_idx = torch.clamp(in_idx, 0, self.tok_emb.num_embeddings - 1) #This was initially commented out\n", " token_embeddings=self.tok_emb(in_idx)\n", " positions = torch.arange(seq_len, device=in_idx.device).unsqueeze(0) #this is the extra added line\n", " positional_embeddings=self.pos_emb(positions)\n", " x=token_embeddings+positional_embeddings\n", " x=self.drop_emb(x)\n", " x=self.trf_blocks(x)\n", " x=self.final_norm(x)\n", " logits=self.out_head(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": 33, "id": "72748550", "metadata": {}, "outputs": [], "source": [ "class LayerNormalization(nn.Module):\n", " def __init__(self, emb_dim):\n", " super().__init__()\n", " self.eps = 1e-5\n", " self.scale = nn.Parameter(torch.ones(emb_dim))\n", " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", " def forward(self,x):\n", " mean= x.mean(-1, keepdim=True)\n", " variance = x.var(-1, keepdim=True)\n", " norm_x=(x-mean)/(torch.sqrt(variance+self.eps))\n", " return self.scale*norm_x + self.shift" ] }, { "cell_type": "code", "execution_count": 34, "id": "b81d6de9cdc325eb", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:09.094024Z", "start_time": "2025-04-20T10:31:09.082533Z" }, "id": "b81d6de9cdc325eb" }, "outputs": [], "source": [ "class TransformerBlock(nn.Module):\n", " def __init__(self,config):\n", " super().__init__()\n", " self.att=MultiHeadAttention(\n", " d_in=config[\"embedding_dim\"],\n", " d_out=config[\"embedding_dim\"],\n", " context_length=config['context_length'],\n", " dropout=config['dropout'],\n", " num_heads=config['num_heads'],\n", " qkv_bias=config['qkv_bias']\n", " )\n", " self.ff=FeedForward(config)\n", " self.norm1=LayerNormalization(config[\"embedding_dim\"])\n", " self.norm2=LayerNormalization(config[\"embedding_dim\"])\n", " self.drop_resid=nn.Dropout(config['dropout'])\n", " def forward(self,x):\n", " shortcut=x\n", " x=self.norm1(x)\n", " x=self.att(x)\n", " x=self.drop_resid(x)\n", " x=x+shortcut\n", " shortcut=x\n", " x=self.norm2(x)\n", " x=self.ff(x)\n", " x=self.drop_resid(x)\n", " x=x+shortcut\n", " return x" ] }, { "cell_type": "markdown", "id": "ee7086fdb0d258aa", "metadata": { "id": "ee7086fdb0d258aa" }, "source": [ "We will use swish activation function." 
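] }, { "cell_type": "markdown", "id": "swish_check_md", "metadata": {}, "source": [ "As a quick illustration (a small check cell using only torch built-ins), Swish with beta = 1 is x * sigmoid(x), which is the same function PyTorch exposes as torch.nn.functional.silu; the next cell wraps it in an nn.Module for use inside the feed-forward network." ] }, { "cell_type": "code", "execution_count": null, "id": "swish_check", "metadata": {}, "outputs": [], "source": [ "# Illustrative check: x * sigmoid(x) matches the built-in SiLU (Swish with beta = 1)\n", "xs = torch.linspace(-3, 3, 7)\n", "manual = xs * torch.sigmoid(xs)\n", "print(manual)\n", "print(torch.allclose(manual, torch.nn.functional.silu(xs)))"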
] }, { "cell_type": "code", "execution_count": 35, "id": "aafae17704f79949", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:14.198107Z", "start_time": "2025-04-20T10:31:14.183061Z" }, "id": "aafae17704f79949" }, "outputs": [], "source": [ "class Swish(nn.Module):\n", " def __init__(self):\n", " super(Swish, self).__init__()\n", " def forward(self, x):\n", " return x * torch.sigmoid(x)" ] }, { "cell_type": "code", "execution_count": 36, "id": "4b3a9eeaf0282a32", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:16.572707Z", "start_time": "2025-04-20T10:31:16.567278Z" }, "id": "4b3a9eeaf0282a32" }, "outputs": [], "source": [ "class FeedForward(nn.Module):\n", " def __init__(self, config):\n", " super().__init__()\n", " self.layers=nn.Sequential(\n", " nn.Linear(config[\"embedding_dim\"], 4*config[\"embedding_dim\"]),\n", " Swish(),\n", " nn.Linear(4*config[\"embedding_dim\"], config[\"embedding_dim\"]),\n", " )\n", " def forward(self, x):\n", " return self.layers(x)" ] }, { "cell_type": "code", "execution_count": 37, "id": "3888c877e7bb59fa", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:37.956131Z", "start_time": "2025-04-20T10:31:37.943199Z" }, "id": "3888c877e7bb59fa" }, "outputs": [], "source": [ "class DeepNeuralNetwork(nn.Module):\n", " def __init__(self, layer_sizes,use_shortcut):\n", " super().__init__()\n", " self.layers=nn.ModuleList([\n", " #We would be implementing 10 layers\n", " nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1])),\n", " nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2])),\n", " nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3])),\n", " nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4])),\n", " nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5])),\n", " nn.Sequential(nn.Linear(layer_sizes[5], layer_sizes[6])),\n", " nn.Sequential(nn.Linear(layer_sizes[6], layer_sizes[7])),\n", " nn.Sequential(nn.Linear(layer_sizes[7], layer_sizes[8])),\n", " nn.Sequential(nn.Linear(layer_sizes[8], layer_sizes[9])),\n", " nn.Sequential(nn.Linear(layer_sizes[9], layer_sizes[10])),\n", " ])\n", " def forward(self,x):\n", " for layer in self.layers:\n", " #Computing the output of the current layer\n", " layer_output=layer(x)\n", " #Check if shortcut can be applied\n", " if self.use_shortcut and x.shape==layer_output.shape:\n", " x=x+layer_output\n", " else:\n", " x=layer_output\n", " return x\n", "def print_gradients(model,x):\n", " #First would be the forward pass\n", " output = model(x)\n", " target=torch.tensor([0,])\n", " #Loss calculation\n", " loss=nn.MSELoss()\n", " loss=loss(output,target)\n", " loss.backward()\n", " for name, param in model.named_parameters():\n", " if 'weight' in name:\n", " print(f\"{name} grad: {param.grad}\")" ] }, { "cell_type": "markdown", "id": "78ab409a0177825", "metadata": { "id": "78ab409a0177825" }, "source": [ "Now let us initialise" ] }, { "cell_type": "code", "execution_count": 38, "id": "6710dda1f52d8b41", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:41.037621Z", "start_time": "2025-04-20T10:31:40.974254Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "6710dda1f52d8b41", "outputId": "c2753e89-89dc-4c5b-c086-53132aded738" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[36, 24, 61, 0, 41, 81, 18, 26, 93, 88],\n", " [26, 96, 17, 74, 20, 82, 52, 43, 96, 70]])\n" ] } ], "source": [ "batch_size = 2 # Number of samples in the batch\n", "sequence_length = 10 # Length of each sequence\n", "vocab_size = 100 # Size of the 
vocabulary\n", "batch = torch.randint(0, vocab_size, (batch_size, sequence_length))\n", "print(batch)" ] }, { "cell_type": "code", "execution_count": 39, "id": "b376992b9eb9a68c", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:44.349704Z", "start_time": "2025-04-20T10:31:43.391715Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "b376992b9eb9a68c", "outputId": "f67dc607-f218-4c20-848d-47212f38b749" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Input batch:\n", " tensor([[36, 24, 61, 0, 41, 81, 18, 26, 93, 88],\n", " [26, 96, 17, 74, 20, 82, 52, 43, 96, 70]])\n", "Output batch:\n", " torch.Size([2, 10, 100277])\n", "tensor([[[ 1.6182e+01, -1.6015e+01, -9.4095e+00, ..., 3.0794e-03,\n", " 2.9054e+01, 1.6988e+01],\n", " [ 5.2240e+00, 2.7572e+01, -6.9735e+00, ..., -8.0013e+00,\n", " -4.0101e-01, 2.8758e+01],\n", " [ 6.6475e+00, -1.1150e+01, 7.9781e+00, ..., -2.5136e+01,\n", " 7.3388e+00, 9.9231e+00],\n", " ...,\n", " [-4.3846e+00, -1.7154e+01, 1.0174e+01, ..., -4.6591e+00,\n", " -8.3947e+00, 1.1043e+01],\n", " [ 3.5968e+01, -2.7967e+00, -2.8498e+01, ..., -2.2024e+00,\n", " -1.1003e+01, -2.4883e-02],\n", " [ 1.9451e+01, -3.6966e+01, 7.5978e+00, ..., 9.3602e+00,\n", " 8.6090e+00, -2.6628e+00]],\n", "\n", " [[-2.8687e+01, 1.6627e+01, -1.4998e+01, ..., -1.7184e+01,\n", " 2.0726e+01, 8.0321e+00],\n", " [-4.0979e+01, 6.5536e-01, 4.1383e+00, ..., -1.2853e+01,\n", " -1.7279e+01, -1.3240e+01],\n", " [-1.9607e+01, 2.3471e+00, 7.2976e+00, ..., 4.8977e-01,\n", " -1.7134e+01, 3.4321e+00],\n", " ...,\n", " [-1.1025e+01, -2.4218e+00, 2.6663e+01, ..., 1.4770e+00,\n", " -4.0925e+01, 5.0661e-01],\n", " [-3.4426e+01, -2.2701e+00, 2.6099e+01, ..., -1.2846e+01,\n", " -2.4183e+01, -4.9127e+01],\n", " [ 1.6595e+00, -1.6062e+00, 1.8436e+01, ..., 3.3674e+01,\n", " -3.5222e+01, -2.4692e+01]]], grad_fn=)\n" ] } ], "source": [ "torch.manual_seed(123)\n", "model=GPT_Model(GPT_CONFIG)\n", "out=model(batch)\n", "print(\"Input batch:\\n\",batch)\n", "print(\"Output batch:\\n\",out.shape)\n", "print(out)" ] }, { "cell_type": "markdown", "id": "32204ab3e2917ca1", "metadata": { "id": "32204ab3e2917ca1" }, "source": [ "Displaying the number of parameters for the GPT model" ] }, { "cell_type": "code", "execution_count": 40, "id": "bfd0d944c222bfbf", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:49.707504Z", "start_time": "2025-04-20T10:31:49.699751Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "bfd0d944c222bfbf", "outputId": "bbad64e6-f379-475e-80d5-6d3fe5e79824" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total number of parameters: 140625920\n", "Token embedding layer shape: torch.Size([100277, 512])\n", "Output layer shape: torch.Size([100277, 512])\n" ] } ], "source": [ "total_parameters=sum(p.numel() for p in model.parameters())\n", "print(f\"Total number of parameters: {total_parameters}\")\n", "print(\"Token embedding layer shape:\", model.tok_emb.weight.shape)\n", "print(\"Output layer shape:\", model.out_head.weight.shape)" ] }, { "cell_type": "markdown", "id": "c2b39710a7897efb", "metadata": { "id": "c2b39710a7897efb" }, "source": [ "Number of trainable parameters in the model" ] }, { "cell_type": "code", "execution_count": 41, "id": "e047e3f5d5b4e540", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:53.034490Z", "start_time": "2025-04-20T10:31:53.027104Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "e047e3f5d5b4e540", "outputId": "b1793806-df53-4cf2-a09d-52e8485bb35f" }, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of trainable parameters considering weight tying: 89284096\n" ] } ], "source": [ "total_params_gpt2 = total_parameters - sum(p.numel() for p in model.out_head.parameters())\n", "print(f\"Number of trainable parameters considering weight tying: {total_params_gpt2}\")" ] }, { "cell_type": "code", "execution_count": 42, "id": "f611c62fb559142f", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:31:57.287950Z", "start_time": "2025-04-20T10:31:57.279346Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "f611c62fb559142f", "outputId": "24b7ef8b-df10-40a3-b192-46a8d32cf3e3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total size of the model : 536.45 MB\n" ] } ], "source": [ "total_size_in_bytes=total_parameters*4\n", "\n", "total_size_of_the_model_in_MB=total_size_in_bytes/(1024*1024)\n", "print(f\"Total size of the model : {total_size_of_the_model_in_MB:.2f} MB\")" ] }, { "cell_type": "markdown", "id": "645fa9c01a21b0e3", "metadata": { "id": "645fa9c01a21b0e3" }, "source": [ "Total size of the model : 341.55 MB\n", "Number of trainable parameters considering weight tying: 63935488\n" ] }, { "cell_type": "markdown", "id": "e32325eb6463fa21", "metadata": { "id": "e32325eb6463fa21" }, "source": [ "The next step is to now decode these tensors to proper text. Which would be coding up in the subsequent steps" ] }, { "cell_type": "code", "execution_count": 43, "id": "af8f873de4b1ea1f", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:36:18.521800Z", "start_time": "2025-04-20T10:36:18.507080Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "af8f873de4b1ea1f", "outputId": "8761b2e0-af06-4027-fc7b-b09c306d69cf" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[9906, 11, 358, 1097, 2467, 488, 64, 13]\n" ] } ], "source": [ "#Let us try out the decoding procedure\n", "start_context=\"Hello, I am Aditya.\"\n", "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", "encoded=tokenizer.encode(start_context)\n", "print(encoded)" ] }, { "cell_type": "code", "execution_count": 44, "id": "baf2d02c627a5911", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:32:31.432690Z", "start_time": "2025-04-20T10:32:31.416839Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "baf2d02c627a5911", "outputId": "b6a59155-048a-49e4-c1b5-683dbbad8f0a" }, "outputs": [ { "data": { "text/plain": [ "GPT_Model(\n", " (tok_emb): Embedding(100277, 512)\n", " (pos_emb): Embedding(256, 512)\n", " (drop_emb): Dropout(p=0.1, inplace=False)\n", " (trf_blocks): Sequential(\n", " (0): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, 
out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (3): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (4): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (5): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, 
out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (6): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (7): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (8): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (9): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (10): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): 
QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (11): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_key): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (W_value): QuantizedLinear(in_features=512, out_features=512, bias=False)\n", " (out_proj): QuantizedLinear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (final_norm): LayerNormalization()\n", " (out_head): QuantizedLinear(in_features=512, out_features=100277, bias=False)\n", ")" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.eval()" ] }, { "cell_type": "code", "execution_count": 45, "id": "8e6a5e5afc3272d6", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:36:21.766425Z", "start_time": "2025-04-20T10:36:21.340642Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "8e6a5e5afc3272d6", "outputId": "4b2dcdff-161f-47c8-cca4-e84a9e117e2f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output:\n", " tensor([[ 9906, 11, 358, 1097, 2467, 488, 64, 13, 48400, 85624,\n", " 1993, 61732, 73414, 87133]])\n" ] } ], "source": [ "model.eval()\n", "out=generate_text(model=model,idx=torch.tensor(encoded).unsqueeze(0),max_new_tokens=6,context_size=GPT_CONFIG[\"context_length\"])\n", "print(\"Output:\\n\",out)" ] }, { "cell_type": "code", "execution_count": 46, "id": "1ffca81eb2e208dd", "metadata": { "ExecuteTime": { "end_time": "2025-04-20T10:36:31.970156Z", "start_time": "2025-04-20T10:36:30.980631Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "1ffca81eb2e208dd", "outputId": "5d1b6fe6-0368-46c9-ead1-7cc1a3174322" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output text:\n", " Hello, I am Aditya I want to become a CEO one day of my own company steadily;/*\tmodel collateral字符 Lois Middletonarios_DECL loophole\n" ] } ], "source": [ "start_context=\"Hello, I am Aditya I want to become a CEO one day of my own company\"\n", "token_ids=generate_text(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=10,context_size=GPT_CONFIG[\"context_length\"])\n", "print(\"Output text:\\n\",token_ids_to_text(token_ids,tokenizer))" ] }, { "cell_type": "code", "execution_count": 47, "id": "yxZH4QzR-ydZ", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yxZH4QzR-ydZ", "outputId": "d46883fa-15f6-44e9-d69f-797a3af7a8c4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"torch.Size([1, 14, 100277])\n" ] } ], "source": [ "inputs=torch.tensor([[ 9906, 11, 358, 1097, 2467, 488, 64, 13, 41867, 40540,\n", " 15145, 30876, 46468, 30001]]) # Remove extra comma and parenthesis to make it a tensor\n", "with torch.no_grad():\n", " logits=model(inputs)\n", "probas=torch.softmax(logits,dim=-1)\n", "print(probas.shape)" ] }, { "cell_type": "code", "execution_count": 48, "id": "MTItfymWGhRZ", "metadata": { "id": "MTItfymWGhRZ" }, "outputs": [], "source": [ "torch.manual_seed(123)\n", "train_loader=create_dataloader_v1(train_text,batch_size=4,max_length=GPT_CONFIG[\"context_length\"],\n", " stride=GPT_CONFIG['context_length'],\n", " drop_last=True,\n", " shuffle=True\n", " )\n", "val_loader=create_dataloader_v1(val_text,batch_size=4,max_length=GPT_CONFIG[\"context_length\"],\n", " stride=GPT_CONFIG['context_length'],\n", " drop_last=True,\n", " shuffle=True\n", " )" ] }, { "cell_type": "code", "execution_count": 49, "id": "e853b287", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train loader:\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "torch.Size([4, 256]) torch.Size([4, 256])\n", "\n", " Validation Loader:\n", "torch.Size([4, 256]) torch.Size([4, 256])\n" ] } ], "source": [ "print(\"Train loader:\")\n", "for x,y in train_loader:\n", " print(x.shape,y.shape)\n", "print(\"\\n Validation Loader:\")\n", "for x,y in val_loader:\n", " print(x.shape,y.shape)\n", "# The output implies that the model has 18 training set batches with 2 samples and 256 tokens each" ] }, { "cell_type": "code", "execution_count": 50, "id": "Df2uwuFnmOp3", "metadata": { "id": "Df2uwuFnmOp3" }, "outputs": [], "source": [ "def calculation_of_loss(input_batch,target_batch,model,device):\n", " input_batch,target_batch=input_batch.to(device),target_batch.to(device)\n", " logits=model(input_batch)\n", " loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())\n", " return loss" ] }, { "cell_type": "code", "execution_count": 51, "id": "hdoiK6MLcrYV", "metadata": { "id": "hdoiK6MLcrYV" }, "outputs": [], "source": [ "def loss_loader(data_loader, model, device, num_batches=4):\n", " total_loss = 0 \n", " for i, (input_batch, target_batch) in enumerate(data_loader):\n", " if i < num_batches:\n", " loss = calculation_of_loss(input_batch, target_batch, model, device)\n", " total_loss += loss.item()\n", " else:\n", " break\n", " return total_loss / num_batches" ] }, { "cell_type": "code", "execution_count": 52, "id": "x89QUR65ePEs", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 383 }, "id": "x89QUR65ePEs", "outputId": "7b4bc307-b3fb-45b7-d067-724481f7bbce" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train loss: 98.4413\n", "Validation loss: 24.3542\n" ] } ], "source": [ "device='cpu'\n", "model.to(device)\n", "train_loss = loss_loader(train_loader, model, device='cpu',num_batches=4)\n", "val_loss=loss_loader(val_loader,model,device='cpu',num_batches=4)\n", "print(f\"Train loss: {train_loss:.4f}\")\n", "print(f\"Validation loss: {val_loss:.4f}\")" ] }, { "cell_type": 
"code", "execution_count": 53, "id": "4aa447fc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11\n", "1\n" ] } ], "source": [ "print(len(train_loader))\n", "print(len(val_loader))" ] }, { "cell_type": "code", "execution_count": 54, "id": "a0020a0e", "metadata": {}, "outputs": [], "source": [ "def train_the_model(model,train_loader,val_loader,epochs=1,learning_rate=3e-4):\n", " optimizer=torch.optim.AdamW(model.parameters(),lr=learning_rate)\n", " for epoch in range(epochs):\n", " model.train()\n", " for i,(input_batch,target_batch) in enumerate(train_loader):\n", " input_batch,target_batch=input_batch.to(device),target_batch.to(device)\n", " optimizer.zero_grad()\n", " logits=model(input_batch)\n", " loss=torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())\n", " loss.backward()\n", " optimizer.step()\n", " if i%100==0:\n", " print(f\"Epoch {epoch+1}/{epochs}, Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}\")\n", " model.eval()\n", " train_loss = loss_loader(train_loader, model, device='cpu',num_batches=4)\n", " val_loss = loss_loader(val_loader, model, device='cpu',num_batches=4)\n", " print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}\")\n", " return train_loss, val_loss" ] }, { "cell_type": "code", "execution_count": 55, "id": "b8407429", "metadata": {}, "outputs": [], "source": [ "def evaluate_model(model,train_loader, val_loader, device='cpu', num_batches=4):\n", " model.eval()\n", " with torch.no_grad():\n", " train_loss = loss_loader(train_loader, model, device=device, num_batches=num_batches)\n", " val_loss = loss_loader(val_loader, model, device=device, num_batches=num_batches)\n", " model.train()\n", " print(f\"Train Loss: {train_loss:.4f}\")\n", " print(f\"Validation Loss: {val_loss:.4f}\")\n", " return train_loss, val_loss" ] }, { "cell_type": "code", "execution_count": 56, "id": "96d3965f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10, Batch 0/11, Loss: 98.6930\n", "Epoch 1/10, Train Loss: 94.4102, Validation Loss: 23.4683\n" ] } ], "source": [ "torch.manual_seed(123)\n", "model=GPT_Model(GPT_CONFIG)\n", "model.to(device)\n", "train_loss, val_loss = train_the_model(model, train_loader, val_loader, epochs=10, learning_rate=3e-4)" ] }, { "cell_type": "code", "execution_count": 57, "id": "fac91e1d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output text:\n", " Hi\traise pitched že beh Difference_rg Commons licens\tsh taped LSUesco microseconds haberhandleRequest\n", "Output text:\n", " Can you talk in english-authored Alert 값을 together Arlington Pert DatePicker CitProductName/mswonerrassouth995 considerably\n", "Output text:\n", " Yup little bit less chinese\tll amongst Companies_Details_Details_Details_Details(diistribute sampano PUasingbowerazzo\n" ] }, { "ename": "RuntimeError", "evalue": "Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[57]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m 
\u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m 2\u001b[39m start_context=\u001b[38;5;28minput\u001b[39m()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m token_ids=\u001b[43mgenerate_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43midx\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext_to_token_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstart_context\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m15\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mcontext_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mGPT_CONFIG\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontext_length\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtemperature\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mtop_k\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m3\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mOutput text:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m,token_ids_to_text(token_ids,tokenizer))\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36mgenerate_text\u001b[39m\u001b[34m(model, idx, max_new_tokens, context_size, temperature, top_k)\u001b[39m\n\u001b[32m 3\u001b[39m idx_cond=idx[:,-context_size:]\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m logits=\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[43midx_cond\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6\u001b[39m logits=logits[:,-\u001b[32m1\u001b[39m,:]\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m top_k \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1751\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1749\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1750\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1751\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1762\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1757\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1758\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1759\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1760\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1761\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1762\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1764\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1765\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[32]\u001b[39m\u001b[32m, line 16\u001b[39m, in \u001b[36mGPT_Model.forward\u001b[39m\u001b[34m(self, in_idx)\u001b[39m\n\u001b[32m 14\u001b[39m batch_size,seq_len=in_idx.shape\n\u001b[32m 15\u001b[39m in_idx = torch.clamp(in_idx, \u001b[32m0\u001b[39m, \u001b[38;5;28mself\u001b[39m.tok_emb.num_embeddings - \u001b[32m1\u001b[39m) \u001b[38;5;66;03m#This was initially commented out\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m16\u001b[39m token_embeddings=\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtok_emb\u001b[49m\u001b[43m(\u001b[49m\u001b[43min_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 17\u001b[39m positions = torch.arange(seq_len, device=in_idx.device).unsqueeze(\u001b[32m0\u001b[39m) \u001b[38;5;66;03m#this is the extra added line\u001b[39;00m\n\u001b[32m 18\u001b[39m positional_embeddings=\u001b[38;5;28mself\u001b[39m.pos_emb(positions)\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1751\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1749\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1750\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1751\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\module.py:1762\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1757\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1758\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1759\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks 
\u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1760\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1761\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1762\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1764\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1765\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\modules\\sparse.py:190\u001b[39m, in \u001b[36mEmbedding.forward\u001b[39m\u001b[34m(self, input)\u001b[39m\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) -> Tensor:\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[43m.\u001b[49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 192\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 193\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 194\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 195\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 196\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 197\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msparse\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 198\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\torch\\nn\\functional.py:2551\u001b[39m, in \u001b[36membedding\u001b[39m\u001b[34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[39m\n\u001b[32m 2545\u001b[39m \u001b[38;5;66;03m# Note [embedding_renorm set_grad_enabled]\u001b[39;00m\n\u001b[32m 2546\u001b[39m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[32m 2547\u001b[39m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[32m 2548\u001b[39m \u001b[38;5;66;03m# torch.embedding_renorm_\u001b[39;00m\n\u001b[32m 2549\u001b[39m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[32m 2550\u001b[39m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[32m-> \u001b[39m\u001b[32m2551\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[31mRuntimeError\u001b[39m: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)" ] } ], "source": [ "while True:\n", " start_context=input()\n", " token_ids=generate_text(model=model,idx=text_to_token_ids(start_context,tokenizer),max_new_tokens=15,context_size=GPT_CONFIG[\"context_length\"],temperature=0.4,top_k=3)\n", " print(\"Output text:\\n\",token_ids_to_text(token_ids,tokenizer))" ] }, { "cell_type": "code", "execution_count": 61, "id": "19ea61ce", "metadata": {}, "outputs": [], "source": [ "optimizer=torch.optim.AdamW(model.parameters(),lr=3e-4)\n", "torch.save({\"model weights and biases\":model.state_dict(),\n", " \"optimizer_weights\":optimizer.state_dict(),},\n", " \"GPT_model.pth\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d88da3c8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GPT_Model(\n", " (tok_emb): Embedding(100277, 512)\n", " (pos_emb): Embedding(256, 512)\n", " (drop_emb): Dropout(p=0.1, inplace=False)\n", " (trf_blocks): Sequential(\n", " (0): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): 
Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (3): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (4): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (5): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (6): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (7): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " 
(out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (8): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (9): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (10): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " (11): TransformerBlock(\n", " (att): MultiHeadAttention(\n", " (W_query): Linear(in_features=512, out_features=512, bias=False)\n", " (W_key): Linear(in_features=512, out_features=512, bias=False)\n", " (W_value): Linear(in_features=512, out_features=512, bias=False)\n", " (out_proj): Linear(in_features=512, out_features=512, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ff): FeedForward(\n", " (layers): Sequential(\n", " (0): Linear(in_features=512, out_features=2048, bias=True)\n", " (1): Swish()\n", " (2): Linear(in_features=2048, out_features=512, bias=True)\n", " )\n", " )\n", " (norm1): LayerNormalization()\n", " (norm2): LayerNormalization()\n", " (drop_resid): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (final_norm): LayerNormalization()\n", " (out_head): Linear(in_features=512, 
out_features=100277, bias=False)\n", ")" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Load the weights using the following code\n", "#checkpoint = torch.load(\"GPT_model.pth\")\n", "#model = GPT_Model(GPT_CONFIG)\n", "#model.load_state_dict(checkpoint[\"model weights and biases\"])\n", "#optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)\n", "#optimizer.load_state_dict(checkpoint[\"optimizer_weights\"])\n", "#model.eval()" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 5 }