---
license: llama3.3
base_model:
- meta-llama/Llama-3.3-70B-Instruct
language:
- en
- hi
- it
- de
- fr
- th
- es
- pt
library_name: transformers
tags:
- meta
- pytorch
- llama
---
# MODEL DESCRIPTION
A 4-bit AWQ (Activation-aware Weight Quantization) compression of meta-llama/Llama-3.3-70B-Instruct, packed for the GEMM kernel variant.
## Loading the model with AutoModelForCausalLM
```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# AWQ checkpoints load directly through transformers (requires the autoawq package)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
print(model)
```
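For a quick smoke test, the model can be prompted through its chat template. A minimal sketch; the prompt and generation settings below are just placeholders:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Build a chat-formatted prompt (the question itself is only an example)
messages = [{"role": "user", "content": "Summarize what AWQ quantization does in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```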
## Serving this model with vLLM via Docker
```bash
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 \
    --swap-space 0 \
    --max-seq-len-to-capture 512 \
    --max-num-seqs 1 \
    --api-key "token-abc123" \
    --max-model-len 8000 \
    --trust-remote-code \
    --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
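Once the container is up, vLLM exposes an OpenAI-compatible API on port 8000. A minimal client sketch, assuming the `openai` Python package is installed; the API key and model name match the flags above:
```python
from openai import OpenAI

# Point the client at the local vLLM server started above
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",  # must match the --model flag
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```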
## A method to merge adapter weights into the base model and quantize with AWQ
```python
import gc
import os

import torch
from awq import AutoAWQForCausalLM
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def clear_gpu_memory():
    """Clear GPU memory and cache"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "auto"):
    """Merge adapter with base model and save"""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device,  # "auto" spreads the fp16 70B weights across available GPUs/CPU
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device,
    )

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    # Save the tokenizer alongside the merged weights so the quantization step can find it
    AutoTokenizer.from_pretrained(base_model_path).save_pretrained(merged_path)

    # Clear model from GPU memory
    del base_model
    del adapter_model
    del merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")


def quantize_model(merged_path: str, quantized_path: str):
    """Quantize the merged model with AWQ (4-bit, GEMM kernels)"""
    print("Starting quantization...")
    quant_config = {
        "w_bit": 4,           # 4-bit weights
        "q_group_size": 128,  # quantization group size
        "zero_point": True,   # asymmetric quantization with zero points
        "version": "GEMM",    # GEMM kernel variant
        "modules_to_not_convert": [
            "attention",   # keep attention in fp16
            "rotary_emb",  # keep embeddings in fp16
            "norm",        # keep normalization in fp16
            "adapter",     # keep adapter weights in fp16
            "lora",        # keep any remaining LoRA weights in fp16
        ],
    }

    # Load on CPU; AutoAWQ moves layers to GPU one at a time during calibration
    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(
        merged_path, low_cpu_mem_usage=True, use_cache=False
    )
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Calibration examples come from AutoAWQ's default calibration dataset
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")


def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Main processing function"""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)
        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)
        print("Process completed successfully!")
        return True
    except Exception as e:
        print(f"Error during processing: {e}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False


if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR,
    )
```
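To sanity-check the result before uploading, the quantized folder can be reloaded with AutoAWQ and prompted once. A minimal sketch; the path below matches the script's output directory and the prompt is illustrative:
```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"  # output of the script above
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(quant_path)

inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```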