Upload 9 files
Committed files:
- README.md +169 -7
- app.py +68 -0
- config.py +15 -0
- data/model_tf.pth +3 -0
- datasets.py +37 -0
- inference.py +37 -0
- model.py +143 -0
- requirements.txt +8 -0
- utils.py +26 -0
README.md
CHANGED
@@ -1,13 +1,175 @@
 ---
-title: Shakespeare Coriolanus
+title: Shakespeare Coriolanus Transformer
-emoji:
+emoji: 📚
-colorFrom:
+colorFrom: blue
-colorTo:
+colorTo: red
 sdk: gradio
-sdk_version:
+sdk_version: 3.50.2
 app_file: app.py
 pinned: false
-short_description: Recreated the Space for Assignment 12
 ---

-
+# Shakespeare Coriolanus Transformer
+This is a test project created to train and evaluate a basic small decoder-only transformer with 124M parameters. The code has modules to both train and test the model, and the trained model can be tried on Hugging Face.
+
+# Steps to Run Locally
+1. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+2. Install the requirements and the Hugging Face CLI:
+```bash
+pip install -r requirements.txt
+pip install --upgrade huggingface-hub
+```
+
+3. To train the model:
+```bash
+python train.py
+```
+
+4. To run the app:
+```bash
+python app.py
+```
+The interface will be available at `http://localhost:7860` by default.
+
+# Training Logs
+```
+loaded 338025 tokens
+1 epoch = 41 batches
+BatchSize: 256 || Tokens per batch; 32
+[STEP 2] Initializing model...
+[STEP 3] Printing Model Architecture Summary...
+
+Model Architecture:
+DecoderTransformer(
+  (wte): Embedding(50257, 768)
+  (wpe): Embedding(1024, 768)
+  (blocks): ModuleList(
+    (0-11): 12 x Block(
+      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+      (att): Attention(
+        (w_qkv): Linear(in_features=768, out_features=2304, bias=True)
+        (proj): Linear(in_features=768, out_features=768, bias=True)
+      )
+      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+      (mlp): MLP(
+        (fc): Linear(in_features=768, out_features=3072, bias=True)
+        (gelu): GELU(approximate='tanh')
+        (proj): Linear(in_features=3072, out_features=768, bias=True)
+      )
+    )
+  )
+  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
+)
+
+Total Parameters: 124.44M
+Total Steps 41 (epochs 1 , stepsPerEpoch 41)
+[STEP 4] Starting Training...
+(venv) gitesh.grover@Giteshs-MacBook-Pro ai-era-assignment12 % python train.py
+
+[INFO] Using device: mps
+[STEP 1] Preparing datasets...
+/Users/gitesh.grover/Study/AI-ERA/venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
+  warnings.warn(
+loaded 338025 tokens
+1 epoch = 41 batches
+BatchSize: 256 || Tokens per batch; 32
+[STEP 2] Initializing model...
+[STEP 3] Printing Model Architecture Summary...
+
+Model Architecture:
+DecoderTransformer(
+  (wte): Embedding(50257, 768)
+  (wpe): Embedding(1024, 768)
+  (blocks): ModuleList(
+    (0-11): 12 x Block(
+      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+      (att): Attention(
+        (w_qkv): Linear(in_features=768, out_features=2304, bias=True)
+        (proj): Linear(in_features=768, out_features=768, bias=True)
+      )
+      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+      (mlp): MLP(
+        (fc): Linear(in_features=768, out_features=3072, bias=True)
+        (gelu): GELU(approximate='tanh')
+        (proj): Linear(in_features=3072, out_features=768, bias=True)
+      )
+    )
+  )
+  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
+)
+
+Total Parameters: 124.44M
+Total Steps 12300 (epochs 300 , stepsPerEpoch 41)
+[STEP 4] Starting Training...
+Epoch 1, Loss: 11.0051
+Epoch 2, Loss: 6.6564
+Epoch 3, Loss: 6.1045
+Epoch 4, Loss: 5.6797
+Epoch 5, Loss: 5.3227
+Epoch 6, Loss: 4.9817
+Epoch 7, Loss: 4.6557
+Epoch 8, Loss: 4.4270
+Epoch 9, Loss: 4.2327
+Epoch 10, Loss: 3.9861
+Epoch 11, Loss: 3.7526
+Epoch 12, Loss: 3.5475
+Epoch 13, Loss: 3.3379
+Epoch 14, Loss: 3.1133
+Epoch 15, Loss: 2.8888
+Epoch 16, Loss: 2.7211
+Epoch 17, Loss: 2.4558
+Epoch 18, Loss: 2.1982
+Epoch 19, Loss: 1.9944
+Epoch 20, Loss: 1.7707
+Epoch 21, Loss: 1.6288
+Epoch 22, Loss: 1.4231
+Epoch 23, Loss: 1.2248
+Epoch 24, Loss: 1.0180
+Epoch 25, Loss: 0.8970
+Epoch 26, Loss: 0.7644
+Epoch 27, Loss: 0.6474
+Epoch 28, Loss: 0.5318
+Epoch 29, Loss: 0.4483
+Epoch 30, Loss: 0.3601
+Epoch 31, Loss: 0.2932
+Epoch 32, Loss: 0.2754
+Epoch 33, Loss: 0.2155
+Epoch 34, Loss: 0.2092
+Epoch 35, Loss: 0.1893
+Epoch 36, Loss: 0.1753
+Epoch 37, Loss: 0.1671
+
+:
+:
+
+Epoch 203, Loss: 0.1224
+Epoch 204, Loss: 0.1243
+Epoch 205, Loss: 0.1308
+Epoch 206, Loss: 0.1358
+Epoch 207, Loss: 0.1413
+Epoch 208, Loss: 0.1425
+Epoch 209, Loss: 0.1281
+Epoch 210, Loss: 0.1264
+Epoch 211, Loss: 0.1305
+Epoch 212, Loss: 0.1399
+Epoch 213, Loss: 0.1266
+Epoch 214, Loss: 0.1135
+Epoch 215, Loss: 0.1127
+Epoch 216, Loss: 0.1137
+Epoch 217, Loss: 0.1045
+Epoch 218, Loss: 0.1074
+Epoch 219, Loss: 0.1014
+Epoch 220, Loss: 0.0997
+
+Target loss achieved at step 8979. Breaking
+0.09973063319921494
+[STEP 5] Saving Model...
+[STEP 6] Testing by predicting next few tokens
+X Shape before test: torch.Size([256, 32])
+256
+Y Shape after test: torch.Size([256, 30])
+```
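Once the app is running, the Space can also be queried programmatically rather than through the browser UI. Below is a minimal sketch using the `gradio_client` package; the package name, the `pip install gradio_client` step, and the `/predict` endpoint (Gradio's default `api_name` for a single-function `Interface`) are assumptions not stated in this commit.

```python
# Sketch: query the locally running app (assumes `pip install gradio_client`
# and that `python app.py` is already serving on port 7860).
from gradio_client import Client

client = Client("http://localhost:7860")
# "/predict" is Gradio's default api_name for a one-function Interface.
result = client.predict("First Citizen:", api_name="/predict")
print(result)  # the 30-token continuation produced by generate_sequence
```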
app.py
ADDED
@@ -0,0 +1,68 @@
+import gradio as gr
+import torch
+import torch.nn as nn
+import tiktoken
+import torchvision.transforms as transforms
+from model import DecoderTransformer
+from config import Config
+from inference import predict
+from utils import get_device
+
+def generate_sequence(text):
+    config = Config()
+    device = get_device()
+    # Load model
+    model = DecoderTransformer(config)
+    model.load_state_dict(torch.load(config.saved_model_path, weights_only=True))
+    model.to(device)
+    model.eval()
+
+    enc = tiktoken.get_encoding('gpt2')
+    tokens = enc.encode(text)
+    T = len(tokens)
+    input_tensor = torch.tensor(tokens, device=device)
+    input_tensor = input_tensor.view(1, T)
+
+    max_output_len = 30
+    y = predict(input_tensor, model, max_output_len=max_output_len)
+    output_tokens = y[0, :].tolist()
+    return enc.decode(output_tokens)
+
+# # Convert input text to tensor using tokenizer
+# input_tensor = torch.tensor([config.tokenizer.encode(text)], device=config.device)
+
+# Generate sequence
+# with torch.no_grad():
+#     # Initialize start token and empty sequence
+#     current_seq = torch.tensor([[config.start_token]], device=config.device)
+
+#     # Generate tokens one by one
+#     for _ in range(config.max_seq_length):
+#         # Get model predictions
+#         output = model(input_tensor, current_seq)
+#         next_token_logits = output[:, -1, :]
+#         next_token = torch.argmax(next_token_logits, dim=-1)
+
+#         # Add predicted token to sequence
+#         current_seq = torch.cat([current_seq, next_token.unsqueeze(0)], dim=1)
+
+#         # Stop if end token is generated
+#         if next_token.item() == config.end_token:
+#             break
+
+#     # Convert tokens to text
+#     generated_sequence = config.tokenizer.decode(current_seq[0].tolist())
+#     return generated_sequence
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=generate_sequence,
+    inputs=gr.Textbox(),
+    outputs=gr.Textbox(),
+    title="Text Generation",
+    description="Enter text to generate a continuation",
+    allow_flagging="never"  # string form; the boolean False is deprecated in Gradio 3.x
+)
+
+if __name__ == "__main__":
+    iface.launch()
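Because `generate_sequence` loads the checkpoint and tokenizer itself, it can be smoke-tested without starting the server. A minimal sketch, assuming the LFS checkpoint `data/model_tf.pth` has been pulled; importing `app` builds the `Interface` but does not launch it, since `launch()` is guarded by `__main__`:

```python
# Sketch: call the same function the Gradio UI wraps, skipping iface.launch().
from app import generate_sequence

print(generate_sequence("First Citizen:"))  # 30-token continuation
```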
config.py
ADDED
@@ -0,0 +1,15 @@
+from dataclasses import dataclass
+
+@dataclass
+class Config:
+    vocab_size: int = 50257     # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    nn_layer: int = 12          # number of layers
+    nn_head: int = 12           # number of heads
+    nn_embed: int = 768         # embedding dimension
+    nn_max_tok_seq: int = 1024  # max token sequence length (for positional embedding); the block size
+    nn_train_tok_seq: int = 32  # actual training token sequence length
+    nn_mlp_expansion: int = 4   # expansion factor in the MLP layer
+    batch_size: int = 256
+    train_tok_size: int = 32
+    saved_model_path = 'data/model_tf.pth'
+    train_input_file = 'data/input.txt'
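These defaults line up with the training log above; a quick sketch of the arithmetic (the 338,025 token count is the figure printed by the data loader for the Coriolanus input file):

```python
from config import Config

cfg = Config()
assert cfg.nn_embed % cfg.nn_head == 0                     # required by Attention
print(cfg.nn_embed // cfg.nn_head)                         # 64 dims per attention head
print(cfg.batch_size * cfg.nn_train_tok_seq)               # 8192 tokens consumed per batch
print(338025 // (cfg.batch_size * cfg.nn_train_tok_seq))   # 41 batches = 1 epoch, as logged
```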
data/model_tf.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fe9fa8e75332d711c50372e863ddfe6cfb4f8fc3b56e8cf2455fb8fb7ca605a
+size 548137112
datasets.py
ADDED
@@ -0,0 +1,37 @@
+import tiktoken
+import torch
+
+class DataLoader:
+    def __init__(self, B, T, inputFile):
+        # Batch size and token sequence length
+        self.B = B
+        self.T = T
+
+        # At init, load tokens from disk and store them in memory
+        # Custom input text
+        with open(inputFile, 'r') as f:
+            text = f.read()
+        # Using the GPT-2 encoding for tokens
+        enc = tiktoken.get_encoding('gpt2')
+        tokens = enc.encode(text)
+        self.tokens = torch.tensor(tokens)
+        self.enc = enc
+        print(f'loaded {len(self.tokens)} tokens')
+        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
+
+        # state
+        self.current_position = 0
+
+    def next_batch(self):
+        B, T = self.B, self.T
+        # Load B*T + 1 tokens (+1 for the shifted targets)
+        buf = self.tokens[self.current_position : self.current_position + B * T + 1]
+        x = (buf[:-1]).view(B, T)  # inputs: tokens [0, B*T)
+        y = (buf[1:]).view(B, T)   # targets: tokens [1, B*T + 1)
+        # advance the position by B*T tokens in the stream
+        self.current_position += B * T
+        # if loading the next batch would be out of bounds, reset (to keep going)
+        if self.current_position + (B * T + 1) > len(self.tokens):
+            self.current_position = 0
+        return x, y
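A short usage sketch, assuming the Coriolanus text sits at `data/input.txt` as configured: `next_batch` yields inputs and targets offset by one token, and wraps around when the stream is exhausted.

```python
from config import Config
from datasets import DataLoader

cfg = Config()
loader = DataLoader(B=cfg.batch_size, T=cfg.nn_train_tok_seq, inputFile=cfg.train_input_file)
x, y = loader.next_batch()
print(x.shape, y.shape)                # torch.Size([256, 32]) torch.Size([256, 32])
assert (x[0, 1:] == y[0, :-1]).all()   # targets are the inputs shifted left by one token
```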
inference.py
ADDED
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from utils import get_device
+from model import DecoderTransformer
+
+def predict(x, model, max_output_len=30):
+    device = get_device(seed=37)
+
+    input_len = x.size(1)
+    # x has shape (B, Tr); Tr is the running token count, increased by 1 on every loop iteration below
+    while x.size(1) < input_len + max_output_len:
+        # forward the model to get the logits
+        with torch.no_grad():
+            # the model returns (logits, loss); [0] selects the logits
+            logits = model(x)[0]  # (B, Tr, vocab_size)
+            # take the logits at the last position, as that is the next-token prediction
+            logits = logits[:, -1, :]  # (B, vocab_size)
+            # get the probabilities over the predicted vocab
+            probs = F.softmax(logits, dim=-1)
+            # do top-k sampling with k=50 (the Hugging Face pipeline default);
+            # topk_probs and topk_indices both have shape (B, 50)
+            topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+            # sample a token from the top-k probabilities
+            # note: multinomial does not demand that the input sums to 1
+            ix = torch.multinomial(topk_probs, 1)  # (B, 1)
+            # gather the corresponding vocabulary indices
+            xcol = torch.gather(topk_indices, -1, ix)  # (B, 1)
+            # append to the sequence, increasing Tr by 1
+            x = torch.cat((x, xcol), dim=1)  # (B, Tr+1)
+
+            # Stop if end token is generated
+            # if xcol == config.end_token:
+            #     break
+
+    return x[:, input_len:]  # (B, max_output_len)
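The same top-k sampling loop that app.py drives can be exercised directly. A minimal sketch, assuming the saved checkpoint exists; it runs on CPU as-is, since `predict` only uses `get_device` for seeding and never moves tensors. The prompt string is an arbitrary example.

```python
import tiktoken
import torch

from config import Config
from inference import predict
from model import DecoderTransformer

config = Config()
model = DecoderTransformer(config)
model.load_state_dict(torch.load(config.saved_model_path, weights_only=True))
model.eval()

enc = tiktoken.get_encoding('gpt2')
x = torch.tensor(enc.encode("Hail, Caius Marcius")).view(1, -1)  # (1, T)
y = predict(x, model, max_output_len=30)  # (1, 30): only the generated tokens
print(enc.decode(y[0].tolist()))
```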
model.py
ADDED
@@ -0,0 +1,143 @@
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+class Attention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+
+        assert config.nn_embed % config.nn_head == 0
+
+        self.nn_head = config.nn_head
+        self.nn_embed = config.nn_embed
+
+        # K, Q, V linear layer, computed for every token of every batch
+        self.w_qkv = nn.Linear(config.nn_embed, config.nn_embed * 3)  # (X, embed) -> (X, 3*embed)
+
+        # Projection layer to mix the heads back together for every token of every batch
+        self.proj = nn.Linear(config.nn_embed, config.nn_embed)  # (X, embed) -> (X, embed)
+        # Register the causal mask as a non-trainable buffer: a lower-triangular matrix of ones,
+        # shaped (1, 1, max_seq, max_seq) so it broadcasts over the batch and head dimensions
+        self.register_buffer("bias", torch.tril(torch.ones(config.nn_max_tok_seq, config.nn_max_tok_seq)).view(1, 1, config.nn_max_tok_seq, config.nn_max_tok_seq))
+
+    def forward(self, x):
+        B, T, E = x.size()  # batch size, number of tokens, embedding dim (nn_embed)
+        q, k, v = self.w_qkv(x).split(self.nn_embed, dim=2)  # split the last dimension into 3 chunks of size embed
+
+        # divide the last dim of q, k, v into groups (heads), then transpose for the attention calculation
+        q = q.view(B, T, self.nn_head, E // self.nn_head).transpose(1, 2)  # (B, head, T, headEmbedSize)
+        k = k.view(B, T, self.nn_head, E // self.nn_head).transpose(1, 2)  # (B, head, T, headEmbedSize)
+        v = v.view(B, T, self.nn_head, E // self.nn_head).transpose(1, 2)  # (B, head, T, headEmbedSize)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # Q*K^T / sqrt(headEmbedSize) ... (B, head, T, T)
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))  # fill positions (T.a, T.b) with -infinity where T.a < T.b, i.e. future tokens
+        att = F.softmax(att, dim=-1)  # masked -infinity values become 0 after softmax
+        y = att @ v  # (B, head, T, headEmbedSize)
+        # move the heads back next to headEmbedSize and concatenate them to recover embed
+        y = y.transpose(1, 2).contiguous().view(B, T, E)  # (B, T, head, headEmbedSize) -> (B, T, E)
+        # projection layer to mix the last dim that the heads were stacked into
+        y = self.proj(y)  # (B, T, E)
+        return y
+
+# Feed-forward layer
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.fc = nn.Linear(config.nn_embed, config.nn_embed * config.nn_mlp_expansion)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.proj = nn.Linear(config.nn_embed * config.nn_mlp_expansion, config.nn_embed)
+        self.proj.NANGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.fc(x)
+        x = self.gelu(x)
+        x = self.proj(x)
+        return x
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.nn_embed)
+        self.att = Attention(config)
+        self.ln_2 = nn.LayerNorm(config.nn_embed)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.att(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class DecoderTransformer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.wte = nn.Embedding(config.vocab_size, config.nn_embed)
+        self.wpe = nn.Embedding(config.nn_max_tok_seq, config.nn_embed)
+        self.blocks = nn.ModuleList([Block(config) for _ in range(config.nn_layer)])
+        self.lm_head = nn.Linear(config.nn_embed, config.vocab_size, bias=False)
+
+        # weight tying: the token embedding and LM head share one parameter matrix
+        self.wte.weight = self.lm_head.weight
+
+        # weight initialization
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            std = 0.02
+            if hasattr(module, 'NANGPT_SCALE_INIT'):
+                std *= (2 * self.config.nn_layer) ** -0.5
+            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def forward(self, idx, targets=None):
+        B, T = idx.size()
+        assert T <= self.config.nn_max_tok_seq, f"Token length ({T}) cannot exceed the max allowed sequence size (block size) ({self.config.nn_max_tok_seq})"
+
+        # Embedding layer
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # 1-D vector 0..T-1 representing the token positions of a single sequence
+        pos_embed = self.wpe(pos)  # position embedding (T, nn_embed): every position of the sequence gets an nn_embed-size vector
+        tok_embed = self.wte(idx)  # token embedding (B, T, nn_embed): every token of every batch gets its own embedding
+        # The position embedding is the same for every batch (it depends on position, not value), so it broadcasts over the batch dim
+        x = pos_embed + tok_embed  # (B, T, nn_embed)
+
+        # Transformer blocks (nn_layer of them)
+        for block in self.blocks:
+            x = block(x)  # (B, T, nn_embed)
+
+        # Head - the final layer
+        logits = self.lm_head(x)  # (B, T, vocab_size)
+
+        # If targets are supplied, compute the loss; otherwise loss stays None. Return both.
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
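A quick shape-and-size check against the logged numbers. With weight tying, `wte` and `lm_head` share one 50257x768 matrix, which `parameters()` counts once; the random token ids below are placeholder inputs for illustration.

```python
import torch
from config import Config
from model import DecoderTransformer

cfg = Config()
model = DecoderTransformer(cfg)

n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.2f}M parameters")       # 124.44M, matching the training log

idx = torch.randint(0, cfg.vocab_size, (2, 16))  # (B=2, T=16) random token ids
logits, loss = model(idx, targets=idx)           # teacher-forced; loss is None if targets are omitted
print(logits.shape)                              # torch.Size([2, 16, 50257])
print(loss.item())                               # ~11 at init, i.e. about ln(50257), as in the log
```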
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch
+torchvision
+pytest
+numpy
+torchsummary
+gradio
+transformers
+tiktoken
utils.py
ADDED
@@ -0,0 +1,26 @@
+import torch
+
+def get_device(seed=1):
+    # Seed is to generate the same random data for each run
+    # (for reproducibility)
+    torch.manual_seed(seed)
+
+    # Set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
+
+    if torch.cuda.is_available():
+        print(f"[INFO] GPU: {torch.cuda.get_device_name(0)}")
+        print(f"[INFO] CUDA Version: {torch.version.cuda}\n")
+        torch.cuda.manual_seed(seed)
+
+    if not torch.backends.mps.is_available():
+        if not torch.backends.mps.is_built():
+            print("MPS not available because the current PyTorch install was not "
+                  "built with MPS enabled.")
+        else:
+            print("MPS not available because the current MacOS version is not 12.3+ "
+                  "and/or you do not have an MPS-enabled device on this machine.")
+    else:
+        torch.mps.manual_seed(seed)
+
+    return device