Ahmed Ahmed
Add model-tracing code for p-value computation (without binary files)
de071e9
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import itertools
import os
from datasets import load_dataset
from tqdm import tqdm
import math
import matplotlib.pyplot as plt
import csv
from utils import interpolate_models
import time
import argparse
import glob
import gc
block_size = 2048
"""
Script for running ablation of tests on m2d2 dataset rather
than simply wikitext
"""
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead if the model supported
    # it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
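
# A toy illustration (not part of the original script) of what group_texts does:
# token lists are concatenated, the remainder past a multiple of the block size
# is dropped, and the rest is re-split into fixed-length chunks (with "labels"
# copied from "input_ids" for causal-LM loss). A small block size is used here
# because with the real block_size of 2048 these few tokens would all be dropped.
def _group_texts_demo(toy_block_size=4):
    toy_input_ids = [[1, 2, 3], [4, 5, 6, 7, 8]]  # made-up token ids
    concatenated = sum(toy_input_ids, [])  # [1, 2, 3, 4, 5, 6, 7, 8]
    usable = (len(concatenated) // toy_block_size) * toy_block_size
    chunks = [
        concatenated[i : i + toy_block_size] for i in range(0, usable, toy_block_size)
    ]
    return chunks  # [[1, 2, 3, 4], [5, 6, 7, 8]]
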
def load_model(model_name):
    return AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
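
# interpolate_models is imported from utils, which is not included in this
# commit. The sketch below is only an assumption of the usual weight-space
# interpolation recipe (an element-wise linear blend of matching parameters);
# the real implementation may differ, e.g. in how model_arch is handled.
def _interpolate_models_sketch(model_a, model_b, alpha):
    import copy

    interpolated = copy.deepcopy(model_a)
    state_a = model_a.state_dict()
    state_b = model_b.state_dict()
    blended = {}
    for name, tensor_a in state_a.items():
        if tensor_a.is_floating_point():
            # alpha = 0.0 recovers model_a's weights, alpha = 1.0 model_b's.
            blended[name] = (1.0 - alpha) * tensor_a + alpha * state_b[name]
        else:
            blended[name] = tensor_a  # leave integer buffers untouched
    interpolated.load_state_dict(blended)
    return interpolated
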
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    os.environ["WANDB_MODE"] = "disabled"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    model_arch = args.model_arch
    if model_arch == "llama":
        model_list = [
            "meta-llama/Llama-2-7b-hf",
            "meta-llama/Llama-2-7b-chat-hf",
            "meta-llama/CodeLlama-7b-Python-hf",
            "meta-llama/CodeLlama-7b-Instruct-hf",
            "codellama/CodeLlama-7b-hf",
            "lmsys/vicuna-7b-v1.5",
            "lmsys/vicuna-7b-v1.1",
            "EleutherAI/llemma_7b",
            "LLM360/Amber",
        ]
    elif model_arch == "olmo":
        model_list = [
            "/scr/ahmedah/olmo/step1000_4B_tokens/seed_0_4B",
            "/scr/ahmedah/olmo/step1000_4B_tokens/seed_42_4B",
        ]

    tokenizer = AutoTokenizer.from_pretrained(model_list[0])
    tokenizer.pad_token = tokenizer.eos_token
    # One test case per M2D2 (S2ORC) domain folder.
    test_cases = [
        {
            "test_name": folder_name,
            "json_dir": f"/juice4/scr4/nlp/model-tracing/m2d2_s2orc/{folder_name}",
            "save_dir": f"/juice4/scr4/nlp/model-tracing/m2d2_s2orc/results_{folder_name}",
            "columns_ignored": ["text", "added", "id", "source", "timestamp", "subdomain"],
        }
        for folder_name in [
            "AI",
            "CV",
            "ET",
            "IM",
            "mtrl-sci",
            "stat-mech",
            "AR",
            "CY",
            "IR",
            "NA",
            "str-el",
            "art",
            "DB",
            "FL",
            "supr-con",
            "CC",
            "DC",
            "GA",
            "LG",
            "phil",
            "CE",
            "dis-nn",
            "GL",
            "LO",
            "CG",
            "DL",
            "GR",
            "MA",
            "quant-gas",
            "CL",
            "DM",
            "GT",
            "mes-hall",
            "CO",
            "DS",
            "HC",
            "MM",
            "soft",
            "CR",
            "EP",
            "HE",
            "MS",
            "SR",
        ]
    ]
    for test_case in test_cases:
        test_name = test_case["test_name"]
        json_dir = test_case["json_dir"]
        save_dir = test_case["save_dir"]
        columns_ignored = test_case["columns_ignored"]

        json_files = glob.glob(f"{json_dir}/*.json")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for json_file in json_files:
            print(f"Processing {json_file}")
            eval_dataset = load_dataset("json", data_files=json_file)

            def tokenize_function(examples):
                return tokenizer(examples["text"])

            # Drop all non-text metadata columns before grouping into blocks.
            tokenized_datasets = eval_dataset.map(
                tokenize_function, batched=True, num_proc=4, remove_columns=columns_ignored
            )
            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                batch_size=1000,
                num_proc=8,
            )
            training_args = TrainingArguments(
                output_dir="./hf_results",
                per_device_eval_batch_size=15,
                do_eval=True,
                report_to="none",
                dataloader_num_workers=8,
                use_cpu=True,
            )

            # Interpolation coefficients swept between the two models in each pair.
            alphas = [0.0, 0.3, 0.5, 0.7, 1.0]

            # The Trainer is only used to build a dataloader over the grouped
            # dataset; evaluation itself runs in the manual loop below.
            initial_model = load_model(model_list[0])
            trainer = Trainer(model=initial_model, args=training_args, eval_dataset=lm_datasets)
            eval_dataloader = trainer.get_test_dataloader(lm_datasets["train"])
            del initial_model

            model_pairs = list(itertools.combinations(enumerate(model_list), 2))

            base_dir = f"{save_dir}/{test_name}"
            os.makedirs(base_dir, exist_ok=True)
            imgs_dir = os.path.join(base_dir, "imgs")
            os.makedirs(imgs_dir, exist_ok=True)
            csv_dir = os.path.join(base_dir, "csv")
            os.makedirs(csv_dir, exist_ok=True)
            current_model_a, current_model_b = None, None
            current_model_a_name, current_model_b_name = None, None

            for (_, model_a_name), (_, model_b_name) in tqdm(
                model_pairs, desc="Model Interpolation"
            ):
                perplexities = []

                # Reload a checkpoint only when the pair advances to a new model,
                # keeping the cached copies on CPU between pairs.
                if current_model_a is None or current_model_a_name != model_a_name:
                    if current_model_a is not None:
                        del current_model_a
                        torch.cuda.empty_cache()
                    current_model_a = load_model(model_a_name).to("cpu")
                    current_model_a_name = model_a_name
                if current_model_b is None or current_model_b_name != model_b_name:
                    if current_model_b is not None:
                        del current_model_b
                        torch.cuda.empty_cache()
                    current_model_b = load_model(model_b_name).to("cpu")
                    current_model_b_name = model_b_name

                with torch.no_grad():
                    for alpha in tqdm(
                        alphas,
                        desc=f"Alpha perplexities for {model_a_name} and {model_b_name}",
                    ):
                        interpolated_model = interpolate_models(
                            current_model_a, current_model_b, alpha, model_arch=model_arch
                        )
                        interpolated_model = interpolated_model.half().to(device)

                        start_time = time.time()
                        losses = []
                        for batch in tqdm(eval_dataloader, desc=f"Evaluating alpha={alpha}"):
                            input_ids = batch["input_ids"].to(device)
                            attention_mask = batch["attention_mask"].to(device)
                            labels = batch["labels"].to(device)
                            outputs = interpolated_model(
                                input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels,
                            )
                            losses.append(outputs.loss.item())

                        loss_mean = sum(losses) / len(losses)
                        print(f"Loss mean: {loss_mean}")
                        execution_time = time.time() - start_time
                        print(f"Execution time: {execution_time} seconds")

                        # Perplexity is the exponential of the mean cross-entropy loss.
                        perplexities.append(math.exp(loss_mean))

                        # Free GPU memory before the next interpolation point.
                        interpolated_model.to("cpu")
                        del interpolated_model, input_ids, attention_mask, labels, outputs
                        torch.cuda.empty_cache()
                        gc.collect()
                model_a_short = model_a_name.split("/")[-1]
                model_b_short = model_b_name.split("/")[-1]
                json_filename = os.path.splitext(os.path.basename(json_file))[0]

                # Append this pair's perplexity curve to the per-file CSV.
                csv_filename = f"{csv_dir}/perplexities_{json_filename}.csv"
                csv_header = ["Model Pair"] + [f"Alpha {alpha}" for alpha in alphas]
                if not os.path.exists(csv_filename):
                    with open(csv_filename, "w", newline="") as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow(csv_header)
                with open(csv_filename, "a", newline="") as csvfile:
                    writer = csv.writer(csvfile)
                    model_pair = f"{model_a_short} vs {model_b_short}"
                    row = [model_pair] + perplexities
                    writer.writerow(row)

                # Plot the perplexity curve across interpolation coefficients.
                plt.figure(figsize=(8, 6))
                plt.plot(alphas, perplexities)
                plt.xlabel("Alpha")
                plt.ylabel("Perplexity")
                plt.title(f"{model_a_short} (Left) vs {model_b_short} (Right)")
                plot_filename = (
                    f"alpha_vs_perplexity_{model_a_short}_vs_{model_b_short}_{json_filename}.png"
                )
                plot_path = f"{imgs_dir}/{plot_filename}"
                plt.savefig(plot_path, dpi=300, bbox_inches="tight")
                plt.close()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Model Interpolation")
    parser.add_argument(
        "--model_arch",
        choices=["llama", "olmo"],
        default="llama",
        help="model architecture to use (default: llama)",
    )
    args = parser.parse_args()
    main(args)
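
# Example invocation (the script filename here is illustrative, and the model
# and dataset paths hard-coded above are cluster-specific):
#   python run_m2d2_ablation.py --model_arch llama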