---
license: llama3.3
base_model:
- meta-llama/Llama-3.3-70B-Instruct
language:
- en
- hi
- it
- de
- fr
- th
- es
- pt
library_name: transformers
tags:
- meta
- pytorch
- llama
---

# MODEL DESCRIPTION

A simple 4-bit compression of the Llama-3.3-70B-Instruct model using the AWQ (Activation-aware Weight Quantization) method with the GEMM kernel.

## Loading the model with AutoModelForCausalLM

```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# Loading the AWQ checkpoint through transformers requires the `autoawq`
# package; device_map="auto" spreads the layers across the available GPUs.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

print(model)
```
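
For a quick end-to-end check, the sketch below loads the model together with its tokenizer and generates a short chat completion. The prompt and generation settings are only illustrative.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Build a chat prompt with the model's chat template and generate a reply
messages = [{"role": "user", "content": "Summarize AWQ quantization in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```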
## Serving this model with vLLM via Docker

The official `vllm/vllm-openai` image exposes an OpenAI-compatible API on port 8000; everything after the image name is passed as arguments to the vLLM server.

```
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 \
    --swap-space 0 \
    --max-seq-len-to-capture 512 \
    --max-num-seqs 1 \
    --api-key "token-abc123" \
    --max-model-len 8000 \
    --trust-remote-code --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
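
Once the container is up, any OpenAI-compatible client can talk to it. A minimal sketch using the `openai` Python package, assuming the server is reachable on localhost and uses the `token-abc123` API key from the command above:

```python
from openai import OpenAI

# Point the client at the local vLLM OpenAI-compatible server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```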
## A method to merge adapter weights into the base model and quantize

The script below merges a PEFT/LoRA adapter into the base model and then quantizes the merged weights to 4-bit AWQ with AutoAWQ (requires `transformers`, `peft`, and `autoawq`).

```python
import gc
import os

import torch
from awq import AutoAWQForCausalLM
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def clear_gpu_memory():
    """Clear GPU memory and cache"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device_map: str = "auto"):
    """Merge a PEFT/LoRA adapter into the base model and save the result"""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        # "auto" shards the fp16 weights across the available GPUs and CPU;
        # a 70B model in fp16 will not fit on a single GPU.
        device_map=device_map,
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(base_model, adapter_path)

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)

    # Save the tokenizer alongside the merged weights so it can be used
    # for AWQ calibration and shipped with the quantized model.
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    tokenizer.save_pretrained(merged_path)

    # Clear model from GPU memory
    del base_model, adapter_model, merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")


def quantize_model(merged_path: str, quantized_path: str):
    """Quantize the merged model with AutoAWQ"""
    print("Starting quantization...")
    quant_config = {
        "w_bit": 4,           # 4-bit weights
        "q_group_size": 128,  # quantization group size
        "zero_point": True,   # asymmetric quantization with zero points
        "version": "GEMM",    # GEMM kernels, matching the repo name
    }
    # AWQ only quantizes the linear layers; norms, embeddings and rotary
    # embeddings stay in fp16 automatically, and after merge_and_unload()
    # there are no adapter/LoRA modules left to exclude.

    # AutoAWQ moves layers to the GPU one at a time during calibration,
    # so the whole model does not need to fit in GPU memory at once.
    print("Loading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(merged_path)
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Quantize, calibrating on AutoAWQ's default calibration dataset
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")


def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Main processing function: merge, then quantize"""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)

        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)

        print("Process completed successfully!")
        return True

    except Exception as e:
        print(f"Error during processing: {e}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False


if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR,
    )
```
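
After the script finishes, the quantized output can be sanity-checked by reloading it with AutoAWQ. A minimal sketch, assuming the default `OUTPUT_DIR` from the script above:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

# Reload the quantized checkpoint to confirm it loads cleanly
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
print(model)
```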