Upload folder using huggingface_hub
- config.json +15 -0
- llama_interpretation.py +188 -0
- model.safetensors +3 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +44 -0
config.json
ADDED
@@ -0,0 +1,15 @@
{
  "architectures": [
    "CustomTransformerForCausalLM"
  ],
  "d_model": 512,
  "dim_feedforward": 2048,
  "dropout": 0.1,
  "group_size": 16,
  "model_type": "custom_transformer",
  "n_heads": 8,
  "num_layers": 6,
  "torch_dtype": "float32",
  "transformers_version": "4.42.4",
  "vocab_size": 32000
}
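These fields mirror the constructor arguments of the CustomConfig class defined in llama_interpretation.py below. A minimal loading sketch, assuming a local clone of this repository (the path "." is a placeholder) with llama_interpretation.py importable:

    # Minimal sketch; "." is a placeholder for a local clone of this repository.
    from llama_interpretation import CustomConfig

    config = CustomConfig.from_pretrained(".")  # reads the config.json shown above
    print(config.d_model, config.n_heads, config.num_layers, config.vocab_size)
    # 512 8 6 32000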
llama_interpretation.py
ADDED
@@ -0,0 +1,188 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig


# Decoder layer built around the grouped MultiHeadAttention defined below
# (pre-norm residual blocks).
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dim_feedforward, dropout=0.1, group_size=16):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout, group_size)
        self.feed_forward = PositionwiseFeedForward(d_model, dim_feedforward, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention sub-layer (pre-norm + residual)
        norm_x = self.layer_norm1(x)
        x = x + self.dropout(self.self_attn(norm_x, norm_x, norm_x))
        # Feed-forward sub-layer (pre-norm + residual)
        norm_x = self.layer_norm2(x)
        x = x + self.dropout(self.feed_forward(norm_x))
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1, group_size=16):
        super(MultiHeadAttention, self).__init__()
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.n_heads = n_heads
        self.d_model = d_model
        self.group_size = group_size

    def forward(self, query, key, value):
        # Project queries, keys and values
        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        # Split the sequence dimension into up to group_size chunks and attend
        # within each chunk. Note: despite the class name, the projections are
        # not split into n_heads, and no causal mask is applied.
        query_groups = query.chunk(self.group_size, dim=1)
        key_groups = key.chunk(self.group_size, dim=1)
        value_groups = value.chunk(self.group_size, dim=1)

        attention_scores = []
        for q, k, v in zip(query_groups, key_groups, value_groups):
            scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.d_model)
            scores = F.softmax(scores, dim=-1)
            scores = self.dropout(scores)
            attention_scores.append(torch.matmul(scores, v))

        # Concatenate the per-group outputs back along the sequence dimension
        output = torch.cat(attention_scores, dim=1)
        return output


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, dim_feedforward, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x


# Stack of decoder layers using the grouped MultiHeadAttention
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, n_heads, dim_feedforward, dropout=0.1, group_size=16):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_heads, dim_feedforward, dropout, group_size)
            for _ in range(num_layers)
        ])
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        x = self.layer_norm(x)
        return x


class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        # Scale embeddings by sqrt(d_model), as in the original Transformer
        return self.lut(x) * math.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        # Precompute sinusoidal encodings; buffer shape is (max_len, 1, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is batch-first (batch, seq_len, d_model): slice the buffer by sequence
        # length and reshape to (1, seq_len, d_model) so it broadcasts over the batch
        x = x + self.pe[:x.size(1), :].transpose(0, 1)
        return self.dropout(x)


class RMSNorm(nn.Module):
    def __init__(self, dim, epsilon=1e-6, scale=True):
        super(RMSNorm, self).__init__()
        self.epsilon = epsilon
        self.scale = scale
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Root-mean-square normalization over the feature dimension
        rms = torch.sqrt(torch.mean(torch.square(x), dim=-1, keepdim=True))
        if self.scale:
            weight = self.weight / (rms + self.epsilon)
            return weight * x
        else:
            return x / (rms + self.epsilon)


class TransformerDecoder(nn.Module):
    def __init__(self, num_layers, d_model, n_heads, dim_feedforward, dropout=0.1, vocab_size=10000, group_size=16):
        super(TransformerDecoder, self).__init__()
        self.embeddings = Embeddings(d_model, vocab_size)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        # Pass group_size through so the attention grouping matches the config
        self.decoder = Decoder(num_layers, d_model, n_heads, dim_feedforward, dropout, group_size)
        self.rms_norm = RMSNorm(d_model)
        self.group_size = group_size

    def forward(self, x):
        x = self.embeddings(x)
        x = self.positional_encoding(x)
        x = self.decoder(x)
        x = self.rms_norm(x)
        return x


class TransformerDecoderLM(nn.Module):
    def __init__(self, num_layers, d_model, n_heads, dim_feedforward, dropout=0.1, vocab_size=10000, group_size=16):
        super(TransformerDecoderLM, self).__init__()
        self.transformer = TransformerDecoder(num_layers, d_model, n_heads, dim_feedforward, dropout, vocab_size, group_size)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids):
        transformer_output = self.transformer(input_ids)
        lm_logits = self.lm_head(transformer_output)
        return lm_logits


class CustomConfig(PretrainedConfig):
    model_type = "custom_transformer"

    def __init__(self, num_layers=6, d_model=512, n_heads=8, dim_feedforward=2048, dropout=0.1, vocab_size=10000, group_size=16, **kwargs):
        self.num_layers = num_layers
        self.d_model = d_model
        self.n_heads = n_heads
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        self.vocab_size = vocab_size
        self.group_size = group_size
        super().__init__(**kwargs)


class CustomTransformerForCausalLM(PreTrainedModel):
    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)
        self.transformer = TransformerDecoderLM(
            num_layers=config.num_layers,
            d_model=config.d_model,
            n_heads=config.n_heads,
            dim_feedforward=config.dim_feedforward,
            dropout=config.dropout,
            vocab_size=config.vocab_size,
            group_size=config.group_size
        )

    def forward(self, input_ids, labels=None):
        logits = self.transformer(input_ids)

        loss = None
        if labels is not None:
            # Labels are expected to be aligned with the logits; no shift is applied here
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return {"loss": loss, "logits": logits}
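Because the model is defined entirely by custom classes, the stock Auto* loaders will not recognize "custom_transformer" unless the classes are registered first. A minimal usage sketch, assuming a local clone of this repository (the path "." and the registration step are assumptions of this example, not part of the upload):

    # Minimal sketch; requires llama_interpretation.py to be importable.
    import torch
    from transformers import AutoConfig, AutoModelForCausalLM
    from llama_interpretation import CustomConfig, CustomTransformerForCausalLM

    # Register the custom classes so the Auto* factories recognize the model_type.
    AutoConfig.register("custom_transformer", CustomConfig)
    AutoModelForCausalLM.register(CustomConfig, CustomTransformerForCausalLM)

    model = AutoModelForCausalLM.from_pretrained(".")  # loads config.json + model.safetensors
    model.eval()

    input_ids = torch.randint(0, model.config.vocab_size, (1, 32))  # dummy batch
    with torch.no_grad():
        out = model(input_ids)
    print(out["logits"].shape)  # expected: torch.Size([1, 32, 32000])

Note that the forward pass accepts only input_ids and optional labels (no attention_mask), so this sketch sticks to a plain forward call rather than generate().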
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:607759b9a7ea1bf6b24467f3fb0d72571495503b6acda537705d775bda98c4fc
size 210811824
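This is a Git LFS pointer; the actual ~210 MB weight file is stored via LFS. For reference, a minimal sketch of inspecting the downloaded weights directly (assumes the safetensors package and a local copy of the resolved file, not this pointer):

    # Minimal sketch; "model.safetensors" is the downloaded LFS file.
    from safetensors.torch import load_file

    state_dict = load_file("model.safetensors")
    for name, tensor in list(state_dict.items())[:5]:
        print(name, tuple(tensor.shape), tensor.dtype)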
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443
tokenizer_config.json
ADDED
@@ -0,0 +1,44 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "add_prefix_space": null,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 32768,
  "pad_token": "<unk>",
  "padding_side": "right",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
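The tokenizer files above describe a standard LlamaTokenizer (a SentencePiece model plus the special-token maps). A minimal sketch of loading and using it, assuming a local clone of this repository ("." is a placeholder) and the sentencepiece package installed:

    # Minimal sketch; picks up tokenizer_config.json, tokenizer.model and friends.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(".")
    ids = tokenizer("Hello, world!", return_tensors="pt").input_ids
    print(ids)                        # <s> (id 1) is prepended since add_bos_token is true
    print(tokenizer.decode(ids[0]))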