# Activate Python virtual environment with all required packages (torch, transformers, peft, etc.)
# This keeps dependencies isolated from your system Python.
source llm-finetune/bin/activate
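# One-time setup if the environment does not exist yet (the package list is an assumption
# based on the dependencies mentioned above; pin versions to match llama_finetuning.py):
#   python3 -m venv llm-finetune
#   llm-finetune/bin/pip install torch transformers peft datasets accelerate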
# Step 1: Run the fine-tuning script (LoRA training)
# - llama_finetuning.py trains your LLaMA model using Q&A pairs.
# - The output will be a LoRA adapter stored in a subdirectory.
python3 llama_finetuning.py
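# Optional sanity check: a PEFT/LoRA adapter directory normally contains
# adapter_config.json and adapter_model.safetensors. The directory name depends on the
# output_dir set inside llama_finetuning.py, so the path below is only a placeholder:
#   ls <adapter-output-dir>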
# Step 2: Make sure the locally built llamafile launcher is available
# - We installed llamafile into ~/dev/tools/llamafile/bin
# - Add that directory to PATH so its binaries can be found automatically.
export PATH="$HOME/dev/tools/llamafile/bin:$PATH"
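# Quick check that the launcher and the zipalign helper (used later in Step 7) now resolve:
command -v llamafile zipalign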
# Step 3: Merge the LoRA adapter with the base model
# - LoRA is efficient for training, but for deployment we want a single merged model.
# - merge_with_autopeft.py loads the base weights and adapter, merges them, and saves FP16 weights in ./merged-fp16
python3 merge_with_autopeft.py
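# Optional sanity check: merged-fp16 should now look like a regular Hugging Face model
# directory (config.json, tokenizer files, and one or more *.safetensors shards).
ls merged-fp16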
# Step 4: Convert Hugging Face FP16 model -> GGUF (llama.cpp runtime format)
# - ./merged-fp16 is the Hugging Face directory created by the merge step.
# - --outfile sets the name of the GGUF file.
# - --outtype f16 ensures weights are saved in FP16 precision before quantization.
python3 ../llama.cpp/convert_hf_to_gguf.py merged-fp16 --outfile merged-fp16.gguf --outtype f16
# Step 5: Quantize FP16 GGUF -> Q6_K GGUF
# - Q6_K is a 6-bit quantization that balances speed, quality, and size.
# - merged-fp16.gguf is the input, merged-Q6_K.gguf is the output.
# - This step makes the model small enough to run efficiently on CPU/GPU.
../llama.cpp/build/bin/llama-quantize merged-fp16.gguf merged-Q6_K.gguf q6_k
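# Compare file sizes: Q6_K stores ~6.6 bits per weight versus 16 bits for FP16, so the
# quantized GGUF should come out at a bit under half the size of the FP16 one.
ls -lh merged-fp16.gguf merged-Q6_K.gguf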
# Step 6: Copy the llamafile launcher
# - "llamafile" is the universal runtime that knows how to run GGUF models.
# - We copy it to resume.llamafile, which will become the final self-contained binary.
cp ~/dev/tools/llamafile/bin/llamafile resume.llamafile
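# Prepare the .args file that Step 7 embeds (skip this if you already keep one in the repo).
# llamafile reads .args as its default command line: one argument per line, with a final
# "..." line that lets end users append their own flags at run time.
# The values below are only an illustrative default; adjust them for your model.
cat > .args <<'EOF'
-m
merged-Q6_K.gguf
--ctx-size
2048
...
EOF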
# Step 7: Pack the model, args, and docs into the llamafile
# - zipalign appends files into the llamafile binary as an uncompressed ZIP archive.
# - merged-Q6_K.gguf is the quantized model.
# - .args (created above) contains default runtime arguments (e.g. -m model, --threads, --ctx-size).
# - README.md is included so end users have documentation directly inside the llamafile.
# - The -j0 option ensures "store only" (no compression) so llamafile can memory-map the model efficiently.
zipalign -j0 resume.llamafile merged-Q6_K.gguf .args README.md
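# Step 8 (optional): Run the finished artifact.
# - chmod makes it executable; launching it starts llamafile's built-in local web UI/server
#   by default, and the "..." entry in .args lets you pass extra llama.cpp flags as needed.
chmod +x resume.llamafile
./resume.llamafile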
# Key points (for educational purposes)
# Virtual environment keeps fine-tuning dependencies isolated.
# LoRA fine-tuning produces small adapter weights → later merged for simplicity.
# Merge step is critical: it creates a “normal” Hugging Face model again, which can be exported.
# convert_hf_to_gguf.py translates HF → GGUF (runtime format for llama.cpp + llamafile).
# Quantization (Q6_K, ~6.6 bits per weight) shrinks the model to roughly 40% of its FP16 size with minimal loss in quality, making it practical to run fast on CPU.
# llamafile packaging produces a single executable that works on Linux/macOS directly; on Windows you just rename it to .exe.
# zipalign -j0 ensures files are stored uncompressed, which llamafile requires for mmap loading.