#!/bin/bash

# Activate the Python virtual environment with all required packages (torch, transformers, peft, etc.)
# This keeps dependencies isolated from your system Python.
source llm-finetune/bin/activate

# Step 1: Run the fine-tuning script (LoRA training)
# - llama_finetuning.py trains your LLaMA model using Q&A pairs.
# - The output is a LoRA adapter stored in a subdirectory.
python3 llama_finetuning.py

# Step 2: Make sure the locally built llamafile launcher is available
# - We installed llamafile into ~/dev/tools/llamafile/bin
# - Add that directory to PATH so its binaries can be found automatically.
export PATH="$HOME/dev/tools/llamafile/bin:$PATH"

# Step 3: Merge the LoRA adapter with the base model
# - LoRA is efficient for training, but for deployment we want a single merged model.
# - merge_with_autopeft.py loads the base weights and the adapter, merges them, and saves FP16 weights in ./merged-fp16
python3 merge_with_autopeft.py

# Step 4: Convert the Hugging Face FP16 model -> GGUF (llama.cpp runtime format)
# - ./merged-fp16 is the Hugging Face directory created by the merge step.
# - --outfile sets the name of the GGUF file.
# - --outtype f16 keeps the weights in FP16 precision before quantization.
python3 ../llama.cpp/convert_hf_to_gguf.py merged-fp16 --outfile merged-fp16.gguf --outtype f16

# Step 5: Quantize FP16 GGUF -> Q6_K GGUF
# - Q6_K is a 6-bit quantization that balances speed, quality, and size.
# - merged-fp16.gguf is the input, merged-Q6_K.gguf is the output.
# - This step makes the model small enough to run efficiently on CPU/GPU.
../llama.cpp/build/bin/llama-quantize merged-fp16.gguf merged-Q6_K.gguf q6_k

# Step 6: Copy the llamafile launcher
# - "llamafile" is the universal runtime that knows how to run GGUF models.
# - We copy it to resume.llamafile, which will become the final self-contained binary.
cp ~/dev/tools/llamafile/bin/llamafile resume.llamafile

# Step 7: Pack the model, args, and docs into the llamafile
# - zipalign appends files to the llamafile binary as an uncompressed ZIP archive.
# - merged-Q6_K.gguf is the quantized model.
# - .args contains default runtime arguments (e.g. -m model, --threads, --ctx-size);
#   a sketch of its layout is given at the end of this script.
# - README.md is included so end users have documentation directly inside the llamafile.
# - The -j0 option ensures "store only" (no compression) so llamafile can memory-map the model efficiently.
zipalign -j0 resume.llamafile merged-Q6_K.gguf .args README.md

# Key points (for educational purposes)
# The virtual environment keeps the fine-tuning dependencies isolated.
# LoRA fine-tuning produces small adapter weights, which are later merged into the base model for simplicity.
# The merge step is critical: it recreates a "normal" Hugging Face model, which can then be exported.
# convert_hf_to_gguf.py translates HF -> GGUF (the runtime format for llama.cpp and llamafile).
# Quantization to Q6_K shrinks the model to roughly 40% of its FP16 size with minimal loss in quality, making it practical to run on CPU.
# llamafile packaging produces a single executable that runs directly on Linux/macOS; on Windows you rename it to .exe (note that Windows caps executables at 4 GB, so very large models must ship as a separate GGUF file).
# zipalign -j0 stores the files uncompressed, which llamafile requires for mmap loading.
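
# (Optional) Smoke-test the quantized model before packaging it.
# This is a sketch, not part of the pipeline above: llama-cli is the llama.cpp
# chat/completion binary built alongside llama-quantize, and the prompt and
# token count below are placeholder values.
# ../llama.cpp/build/bin/llama-cli -m merged-Q6_K.gguf -p "Give a one-sentence summary of this resume." -n 64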
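
# For reference, the .args file packed in Step 7 lists one runtime argument per
# line. A minimal sketch (the values below are illustrative defaults, not the
# contents of the .args file this script assumes you already created):
# cat > .args <<'EOF'
# -m
# merged-Q6_K.gguf
# --ctx-size
# 2048
# EOF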
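
# Because the finished llamafile is also a valid ZIP archive, you should be able
# to inspect what got packed with ordinary zip tooling (assumption: unzip is
# installed and can read a zip appended to an executable):
# unzip -l resume.llamafile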
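
# To try the result, make it executable and run it; llamafile serves the embedded
# model locally (usage sketch; exact defaults depend on your llamafile version):
# chmod +x resume.llamafile
# ./resume.llamafile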