Ahmed Ahmed
Add model-tracing code for p-value computation (without binary files)
de071e9
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import itertools
import os
from datasets import load_dataset
from tqdm import tqdm
import math
import matplotlib.pyplot as plt
import csv
from utils import interpolate_models
import time
import argparse
import glob
import gc
block_size = 2048
"""
Script for running ablation of tests on m2d2 dataset rather
than simply wikitext
"""
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead if the model supported
    # it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
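
# A toy illustration (not part of the original script) of what group_texts does:
# token lists are concatenated, the remainder past a multiple of the block size
# is dropped, and the rest is re-split into fixed-length chunks (with "labels"
# copied from "input_ids" for causal-LM loss). A small block size is used here
# because with the real block_size of 2048 these few tokens would all be dropped.
def _group_texts_demo(toy_block_size=4):
    toy_input_ids = [[1, 2, 3], [4, 5, 6, 7, 8]]  # made-up token ids
    concatenated = sum(toy_input_ids, [])  # [1, 2, 3, 4, 5, 6, 7, 8]
    usable = (len(concatenated) // toy_block_size) * toy_block_size
    chunks = [
        concatenated[i : i + toy_block_size] for i in range(0, usable, toy_block_size)
    ]
    return chunks  # [[1, 2, 3, 4], [5, 6, 7, 8]]
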
def load_model(model_name):
    return AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
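
# interpolate_models is imported from utils, which is not included in this
# commit. The sketch below is only an assumption of the usual weight-space
# interpolation recipe (an element-wise linear blend of matching parameters);
# the real implementation may differ, e.g. in how model_arch is handled.
def _interpolate_models_sketch(model_a, model_b, alpha):
    import copy

    interpolated = copy.deepcopy(model_a)
    state_a = model_a.state_dict()
    state_b = model_b.state_dict()
    blended = {}
    for name, tensor_a in state_a.items():
        if tensor_a.is_floating_point():
            # alpha = 0.0 recovers model_a's weights, alpha = 1.0 model_b's.
            blended[name] = (1.0 - alpha) * tensor_a + alpha * state_b[name]
        else:
            blended[name] = tensor_a  # leave integer buffers untouched
    interpolated.load_state_dict(blended)
    return interpolated
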
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    os.environ["WANDB_MODE"] = "disabled"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    model_arch = args.model_arch
    if model_arch == "llama":
        model_list = [
            "meta-llama/Llama-2-7b-hf",
            "meta-llama/Llama-2-7b-chat-hf",
            "meta-llama/CodeLlama-7b-Python-hf",
            "meta-llama/CodeLlama-7b-Instruct-hf",
            "codellama/CodeLlama-7b-hf",
            "lmsys/vicuna-7b-v1.5",
            "lmsys/vicuna-7b-v1.1",
            "EleutherAI/llemma_7b",
            "LLM360/Amber",
        ]
    elif model_arch == "olmo":
        model_list = [
            "/scr/ahmedah/olmo/step1000_4B_tokens/seed_0_4B",
            "/scr/ahmedah/olmo/step1000_4B_tokens/seed_42_4B",
        ]

    tokenizer = AutoTokenizer.from_pretrained(model_list[0])
    tokenizer.pad_token = tokenizer.eos_token
    # One test case per M2D2 (S2ORC) domain folder.
    test_cases = [
        {
            "test_name": folder_name,
            "json_dir": f"/juice4/scr4/nlp/model-tracing/m2d2_s2orc/{folder_name}",
            "save_dir": f"/juice4/scr4/nlp/model-tracing/m2d2_s2orc/results_{folder_name}",
            "columns_ignored": ["text", "added", "id", "source", "timestamp", "subdomain"],
        }
        for folder_name in [
            "AI",
            "CV",
            "ET",
            "IM",
            "mtrl-sci",
            "stat-mech",
            "AR",
            "CY",
            "IR",
            "NA",
            "str-el",
            "art",
            "DB",
            "FL",
            "supr-con",
            "CC",
            "DC",
            "GA",
            "LG",
            "phil",
            "CE",
            "dis-nn",
            "GL",
            "LO",
            "CG",
            "DL",
            "GR",
            "MA",
            "quant-gas",
            "CL",
            "DM",
            "GT",
            "mes-hall",
            "CO",
            "DS",
            "HC",
            "MM",
            "soft",
            "CR",
            "EP",
            "HE",
            "MS",
            "SR",
        ]
    ]
    for test_case in test_cases:
        test_name = test_case["test_name"]
        json_dir = test_case["json_dir"]
        save_dir = test_case["save_dir"]
        columns_ignored = test_case["columns_ignored"]

        json_files = glob.glob(f"{json_dir}/*.json")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for json_file in json_files:
            print(f"Processing {json_file}")
            eval_dataset = load_dataset("json", data_files=json_file)

            def tokenize_function(examples):
                return tokenizer(examples["text"])

            # Drop all non-text metadata columns before grouping into blocks.
            tokenized_datasets = eval_dataset.map(
                tokenize_function, batched=True, num_proc=4, remove_columns=columns_ignored
            )
            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                batch_size=1000,
                num_proc=8,
            )
            training_args = TrainingArguments(
                output_dir="./hf_results",
                per_device_eval_batch_size=15,
                do_eval=True,
                report_to="none",
                dataloader_num_workers=8,
                use_cpu=True,
            )

            # Interpolation coefficients swept between the two models in each pair.
            alphas = [0.0, 0.3, 0.5, 0.7, 1.0]

            # The Trainer is only used to build a dataloader over the grouped
            # dataset; evaluation itself runs in the manual loop below.
            initial_model = load_model(model_list[0])
            trainer = Trainer(model=initial_model, args=training_args, eval_dataset=lm_datasets)
            eval_dataloader = trainer.get_test_dataloader(lm_datasets["train"])
            del initial_model

            model_pairs = list(itertools.combinations(enumerate(model_list), 2))

            base_dir = f"{save_dir}/{test_name}"
            os.makedirs(base_dir, exist_ok=True)
            imgs_dir = os.path.join(base_dir, "imgs")
            os.makedirs(imgs_dir, exist_ok=True)
            csv_dir = os.path.join(base_dir, "csv")
            os.makedirs(csv_dir, exist_ok=True)
            current_model_a, current_model_b = None, None
            current_model_a_name, current_model_b_name = None, None

            for (_, model_a_name), (_, model_b_name) in tqdm(
                model_pairs, desc="Model Interpolation"
            ):
                perplexities = []

                # Reload a checkpoint only when the pair advances to a new model,
                # keeping the cached copies on CPU between pairs.
                if current_model_a is None or current_model_a_name != model_a_name:
                    if current_model_a is not None:
                        del current_model_a
                        torch.cuda.empty_cache()
                    current_model_a = load_model(model_a_name).to("cpu")
                    current_model_a_name = model_a_name
                if current_model_b is None or current_model_b_name != model_b_name:
                    if current_model_b is not None:
                        del current_model_b
                        torch.cuda.empty_cache()
                    current_model_b = load_model(model_b_name).to("cpu")
                    current_model_b_name = model_b_name

                with torch.no_grad():
                    for alpha in tqdm(
                        alphas,
                        desc=f"Alpha perplexities for {model_a_name} and {model_b_name}",
                    ):
                        interpolated_model = interpolate_models(
                            current_model_a, current_model_b, alpha, model_arch=model_arch
                        )
                        interpolated_model = interpolated_model.half().to(device)

                        start_time = time.time()
                        losses = []
                        for batch in tqdm(eval_dataloader, desc=f"Evaluating alpha={alpha}"):
                            input_ids = batch["input_ids"].to(device)
                            attention_mask = batch["attention_mask"].to(device)
                            labels = batch["labels"].to(device)
                            outputs = interpolated_model(
                                input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels,
                            )
                            losses.append(outputs.loss.item())

                        loss_mean = sum(losses) / len(losses)
                        print(f"Loss mean: {loss_mean}")
                        execution_time = time.time() - start_time
                        print(f"Execution time: {execution_time} seconds")

                        # Perplexity is the exponential of the mean cross-entropy loss.
                        perplexities.append(math.exp(loss_mean))

                        # Free GPU memory before the next interpolation point.
                        interpolated_model.to("cpu")
                        del interpolated_model, input_ids, attention_mask, labels, outputs
                        torch.cuda.empty_cache()
                        gc.collect()
                model_a_short = model_a_name.split("/")[-1]
                model_b_short = model_b_name.split("/")[-1]
                json_filename = os.path.splitext(os.path.basename(json_file))[0]

                # Append this pair's perplexity curve to the per-file CSV.
                csv_filename = f"{csv_dir}/perplexities_{json_filename}.csv"
                csv_header = ["Model Pair"] + [f"Alpha {alpha}" for alpha in alphas]
                if not os.path.exists(csv_filename):
                    with open(csv_filename, "w", newline="") as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow(csv_header)
                with open(csv_filename, "a", newline="") as csvfile:
                    writer = csv.writer(csvfile)
                    model_pair = f"{model_a_short} vs {model_b_short}"
                    row = [model_pair] + perplexities
                    writer.writerow(row)

                # Plot the perplexity curve across interpolation coefficients.
                plt.figure(figsize=(8, 6))
                plt.plot(alphas, perplexities)
                plt.xlabel("Alpha")
                plt.ylabel("Perplexity")
                plt.title(f"{model_a_short} (Left) vs {model_b_short} (Right)")
                plot_filename = (
                    f"alpha_vs_perplexity_{model_a_short}_vs_{model_b_short}_{json_filename}.png"
                )
                plot_path = f"{imgs_dir}/{plot_filename}"
                plt.savefig(plot_path, dpi=300, bbox_inches="tight")
                plt.close()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Model Interpolation")
    parser.add_argument(
        "--model_arch",
        choices=["llama", "olmo"],
        default="llama",
        help="model architecture to use (default: llama)",
    )
    args = parser.parse_args()
    main(args)
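
# Example invocation (the script filename here is illustrative, and the model
# and dataset paths hard-coded above are cluster-specific):
#   python run_m2d2_ablation.py --model_arch llama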