#!/bin/bash

# Activate the Python virtual environment with all required packages (torch, transformers, peft, etc.)
# This keeps dependencies isolated from your system Python.
source llm-finetune/bin/activate

# Step 1: Run the fine-tuning script (LoRA training)
# - llama_finetuning.py trains your LLaMA model using Q&A pairs.
# - The output is a LoRA adapter stored in a subdirectory.
python3 llama_finetuning.py

# Step 2: Make sure the locally built llamafile launcher is available
# - We installed llamafile into ~/dev/tools/llamafile/bin
# - Add that directory to PATH so its binaries can be found automatically.
export PATH="$HOME/dev/tools/llamafile/bin:$PATH"

# Step 3: Merge the LoRA adapter with the base model
# - LoRA is efficient for training, but for deployment we want a single merged model.
# - merge_with_autopeft.py loads the base weights and the adapter, merges them, and saves FP16 weights in ./merged-fp16
python3 merge_with_autopeft.py

# Step 4: Convert the Hugging Face FP16 model -> GGUF (llama.cpp runtime format)
# - ./merged-fp16 is the Hugging Face directory created by the merge step.
# - --outfile sets the name of the GGUF file.
# - --outtype f16 keeps the weights in FP16 precision before quantization.
python3 ../llama.cpp/convert_hf_to_gguf.py merged-fp16 --outfile merged-fp16.gguf --outtype f16

# Step 5: Quantize FP16 GGUF -> Q6_K GGUF
# - Q6_K is a 6-bit quantization that balances speed, quality, and size.
# - merged-fp16.gguf is the input, merged-Q6_K.gguf is the output.
# - This step makes the model small enough to run efficiently on CPU/GPU.
../llama.cpp/build/bin/llama-quantize merged-fp16.gguf merged-Q6_K.gguf q6_k

# Step 6: Copy the llamafile launcher
# - "llamafile" is the universal runtime that knows how to run GGUF models.
# - We copy it to resume.llamafile, which will become the final self-contained binary.
cp ~/dev/tools/llamafile/bin/llamafile resume.llamafile

# Step 7: Pack the model, args, and docs into the llamafile
# - zipalign appends files to the llamafile binary as an uncompressed ZIP archive.
# - merged-Q6_K.gguf is the quantized model.
# - .args contains default runtime arguments (e.g. -m model, --threads, --ctx-size);
#   a sketch of its layout is given at the end of this script.
# - README.md is included so end users have documentation directly inside the llamafile.
# - The -j0 option ensures "store only" (no compression) so llamafile can memory-map the model efficiently.
zipalign -j0 resume.llamafile merged-Q6_K.gguf .args README.md

# Key points (for educational purposes)
# The virtual environment keeps the fine-tuning dependencies isolated.
# LoRA fine-tuning produces small adapter weights, which are later merged into the base model for simplicity.
# The merge step is critical: it recreates a "normal" Hugging Face model, which can then be exported.
# convert_hf_to_gguf.py translates HF -> GGUF (the runtime format for llama.cpp and llamafile).
# Quantization to Q6_K shrinks the model to roughly 40% of its FP16 size with minimal loss in quality, making it practical to run on CPU.
# llamafile packaging produces a single executable that runs directly on Linux/macOS; on Windows you rename it to .exe (note that Windows caps executables at 4 GB, so very large models must ship as a separate GGUF file).
# zipalign -j0 stores the files uncompressed, which llamafile requires for mmap loading.
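
# (Optional) Smoke-test the quantized model before packaging it.
# This is a sketch, not part of the pipeline above: llama-cli is the llama.cpp
# chat/completion binary built alongside llama-quantize, and the prompt and
# token count below are placeholder values.
# ../llama.cpp/build/bin/llama-cli -m merged-Q6_K.gguf -p "Give a one-sentence summary of this resume." -n 64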
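
# For reference, the .args file packed in Step 7 lists one runtime argument per
# line. A minimal sketch (the values below are illustrative defaults, not the
# contents of the .args file this script assumes you already created):
# cat > .args <<'EOF'
# -m
# merged-Q6_K.gguf
# --ctx-size
# 2048
# EOF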
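
# Because the finished llamafile is also a valid ZIP archive, you should be able
# to inspect what got packed with ordinary zip tooling (assumption: unzip is
# installed and can read a zip appended to an executable):
# unzip -l resume.llamafile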
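
# To try the result, make it executable and run it; llamafile serves the embedded
# model locally (usage sketch; exact defaults depend on your llamafile version):
# chmod +x resume.llamafile
# ./resume.llamafile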