Spaces:
Runtime error
Runtime error
File size: 5,166 Bytes
de071e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import torch
from transformers import AutoModelForCausalLM
from tracing.utils.llama.model import rotate_model_t5
# Make bfloat16 the default so any tensors created later match the checkpoint dtype.
torch.set_default_dtype(torch.bfloat16)
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda")
# Second checkpoint: an independently trained LLaMA-7B variant to compare against.
model_rot_name = "yahma/llama-7b-hf"
model_rotated = AutoModelForCausalLM.from_pretrained(model_rot_name, torch_dtype=torch.bfloat16).to(
"cuda"
)
# Rotate the second model's weights in place; presumably the invariants computed
# below should be unaffected by this transformation — confirm against
# tracing.utils.llama.model.rotate_model_t5.
rotate_model_t5(model_rotated)
# rotate_model(model_rotated)
# Fixing the layer norms to 1's (HUReF works)
# """
# fix_layer_norm(model)
# fix_layer_norm(model_rotated)
# """
# base_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# dataset = prepare_hf_dataset("dlwh/wikitext_103_detokenized",512,base_tokenizer)
# dataloader = prepare_hf_dataloader(dataset,1)
# evaluate_model = evaluate(model, dataloader)
# evaluate_rotated = evaluate(model_rotated, dataloader)
# print("outputs are aligned: ")
# print([abs(evaluate_model[i] - evaluate_rotated[i]) <= 0.01 for i in range(len(evaluate_model))])
# NOTE: state_dict() returns references to the live parameter tensors, so the
# model.to("cuda"/"cpu") calls further down also move these dict entries.
weights = model.state_dict()
weights_rotated = model_rotated.state_dict()
# model.to('cuda')
# print("invariant 1")
# print(weights['model.embed_tokens.weight']@weights['model.layers.0.self_attn.q_proj.weight'].T@weights['model.layers.0.self_attn.k_proj.weight']@weights['model.embed_tokens.weight'].T)
# print("invariant 2")
# print(weights['model.embed_tokens.weight']@weights['model.layers.0.self_attn.v_proj.weight'].T@weights['model.layers.0.self_attn.o_proj.weight'].T@weights['model.embed_tokens.weight'].T)
# print("invariant 3")
# print(weights['model.embed_tokens.weight']@weights[f'model.layers.0.mlp.up_proj.weight'].T@weights[f'model.layers.0.mlp.down_proj.weight'].T@weights['model.embed_tokens.weight'].T)
# print()
# model.to('cpu')
# model_rotated.to('cuda')
# print("rotated")
# print("invariant 1")
# print(weights_rotated['model.embed_tokens.weight']@weights_rotated['model.layers.0.self_attn.q_proj.weight'].T@weights_rotated['model.layers.0.self_attn.k_proj.weight']@weights_rotated['model.embed_tokens.weight'].T)
# print("invariant 2")
# print(weights_rotated['model.embed_tokens.weight']@weights_rotated['model.layers.0.self_attn.v_proj.weight'].T@weights_rotated['model.layers.0.self_attn.o_proj.weight'].T@weights_rotated['model.embed_tokens.weight'].T)
# print("invariant 3")
# print(weights_rotated['model.embed_tokens.weight']@weights_rotated[f'model.layers.0.mlp.up_proj.weight'].T@weights_rotated[f'model.layers.0.mlp.down_proj.weight'].T@weights_rotated['model.embed_tokens.weight'].T)
# print()
# model_rotated.to('cpu')
# --- Cosine similarity between the weight invariants of the two models ---

_EMBED_KEY = "model.embed_tokens.weight"

# (label, first projection key, second projection key, transpose-second flag).
# Each invariant has the form  E @ A.T @ B' @ E.T  where B' is B or B.T and
# E is the token-embedding matrix.
_INVARIANT_SPECS = [
    ("invariant 1", "model.layers.0.self_attn.q_proj.weight",
     "model.layers.0.self_attn.k_proj.weight", False),
    ("invariant 2", "model.layers.0.self_attn.v_proj.weight",
     "model.layers.0.self_attn.o_proj.weight", True),
    ("invariant 3", "model.layers.0.mlp.up_proj.weight",
     "model.layers.0.mlp.down_proj.weight", True),
]


def _invariant(w, key_a, key_b, transpose_b):
    """Return E @ A.T @ (B.T if transpose_b else B) @ E.T for state dict `w`."""
    emb = w[_EMBED_KEY]
    second = w[key_b].T if transpose_b else w[key_b]
    return emb @ w[key_a].T @ second @ emb.T


def _cosine(u, v):
    """Cosine similarity between two tensors, flattened to vectors."""
    u = torch.flatten(u)
    v = torch.flatten(v)
    return torch.dot(u, v) / (torch.norm(u) * torch.norm(v))


print("cosine similarity")
for _label, _key_a, _key_b, _transpose_b in _INVARIANT_SPECS:
    # Only one 7B model fits on the GPU at a time, so shuttle each model in and
    # out; the state_dict entries alias the parameters and move with the model.
    model.to("cuda")
    print(_label)
    _inv = _invariant(weights, _key_a, _key_b, _transpose_b)
    model.to("cpu")
    model_rotated.to("cuda")
    _inv_rot = _invariant(weights_rotated, _key_a, _key_b, _transpose_b)
    model_rotated.to("cpu")
    # BUGFIX: the original called `invariant.to("cuda")` and discarded the
    # result — Tensor.to is not in-place, so those calls were no-ops. They are
    # dropped; both invariants already live on the GPU where they were built.
    print(_cosine(_inv, _inv_rot))
|