#!/bin/bash

# Setup script for the DeepSeek Children's Stories model: runs preflight
# checks, builds the environment, prepares data, trains the base model,
# and optionally performs LoRA finetuning and a quick generation test.

# Terminal colors for status output
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration (every value can be overridden via the environment)
PROJECT_ROOT="${PROJECT_ROOT:-$(pwd)}"
VENV_PATH="${VENV_PATH:-${PROJECT_ROOT}/venv}"
CHECKPOINT_DIR="${CHECKPOINT_DIR:-${PROJECT_ROOT}/checkpoints}"
LORA_CHECKPOINT_DIR="${LORA_CHECKPOINT_DIR:-${PROJECT_ROOT}/lora_checkpoints}"
REQUIRED_SPACE_MB="${REQUIRED_SPACE_MB:-2000}"
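
# Example: running against a custom location (paths are illustrative):
#   PROJECT_ROOT=/data/deepseek VENV_PATH=/data/deepseek/.venv ./setup.sh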

print_status() {
    echo -e "${GREEN}[+] $1${NC}"
}

print_error() {
    echo -e "${RED}[-] $1${NC}"
}

print_warning() {
    echo -e "${YELLOW}[!] $1${NC}"
}

print_info() {
    echo -e "${BLUE}[i] $1${NC}"
}

handle_error() {
    print_error "$1"
    exit 1
}

command_exists() {
    command -v "$1" &> /dev/null
}

check_disk_space() {
    # Check the project filesystem rather than whatever directory the
    # script happens to be invoked from
    local available_space_mb
    available_space_mb=$(df -m "${PROJECT_ROOT}" | awk 'NR==2 {print $4}')
    if [ "$available_space_mb" -lt "$REQUIRED_SPACE_MB" ]; then
        print_warning "Low disk space. Only ${available_space_mb}MB available, ${REQUIRED_SPACE_MB}MB required."
        return 1
    fi
    return 0
}
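
# For larger datasets, the threshold can be raised, e.g.:
#   REQUIRED_SPACE_MB=10000 ./setup.sh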

check_gpu_memory() {
    if command_exists nvidia-smi; then
        # nvidia-smi prints one line per device; only the first GPU is
        # inspected here
        local total_memory free_memory used_memory
        total_memory=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1)
        free_memory=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits | head -n1)
        used_memory=$((total_memory - free_memory))
        print_status "GPU Memory: ${used_memory}MB used, ${free_memory}MB free of ${total_memory}MB total"

        if [ "$free_memory" -lt 4000 ]; then
            print_warning "Low GPU memory. Consider reducing batch size or model size."
        fi
    else
        print_warning "nvidia-smi not found. GPU training may not be available."
    fi
}
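
# To pin the run to one device on a multi-GPU host, CUDA_VISIBLE_DEVICES can
# be set before invoking this script, e.g.:
#   CUDA_VISIBLE_DEVICES=0 ./setup.sh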

create_project_structure() {
    print_status "Creating project structure..."
    mkdir -p "${PROJECT_ROOT}/src/data" \
             "${PROJECT_ROOT}/src/model" \
             "${PROJECT_ROOT}/src/training" \
             "${PROJECT_ROOT}/src/inference" \
             "${CHECKPOINT_DIR}" \
             "${LORA_CHECKPOINT_DIR}" || handle_error "Failed to create directories"
}

setup_virtual_env() {
    print_status "Creating virtual environment..."
    python3 -m venv "${VENV_PATH}" || handle_error "Failed to create virtual environment"
    source "${VENV_PATH}/bin/activate" || handle_error "Failed to activate virtual environment"

    print_status "Installing dependencies..."
    pip install --upgrade pip || handle_error "Failed to upgrade pip"
    pip install -r "${PROJECT_ROOT}/requirements.txt" || handle_error "Failed to install requirements"
}
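
# A requirements.txt is expected at the project root. Its exact contents are
# project-specific, but at minimum it must cover what the generated helpers
# below import:
#   torch
#   peft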

prepare_dataset() {
    print_status "Preparing dataset..."
    cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

    # Generate a small driver around the project's data processor
    cat > process_data.py << 'EOF'
import os
import sys

# Make the project's src/ tree importable
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

from data.data_processor import DeepSeekDataProcessor

def main():
    print("[+] Processing dataset into binary files...")
    processor = DeepSeekDataProcessor()
    processor.prepare_dataset()
    print("[+] Data processing completed successfully!")

if __name__ == "__main__":
    main()
EOF

    python3 process_data.py || handle_error "Failed to process dataset"

    # Training cannot proceed without the tokenized train/validation splits
    if [ ! -f "${PROJECT_ROOT}/src/data/train.bin" ] || [ ! -f "${PROJECT_ROOT}/src/data/validation.bin" ]; then
        handle_error "Data processing failed - required files not created"
    fi
}

train_base_model() {
    print_status "Starting DeepSeek base model training..."
    cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

    python3 src/run_training.py \
        --batch-size "${BATCH_SIZE:-12}" \
        --max-iters "${MAX_ITERS:-20000}" \
        --eval-interval "${EVAL_INTERVAL:-1000}" \
        --eval-iters "${EVAL_ITERS:-200}" \
        --learning-rate "${LEARNING_RATE:-6e-4}" \
        --weight-decay "${WEIGHT_DECAY:-0.1}" \
        --warmup-iters "${WARMUP_ITERS:-2000}" \
        --lr-decay-iters "${LR_DECAY_ITERS:-20000}" \
        --min-lr "${MIN_LR:-6e-5}" \
        --moe-experts "${MOE_EXPERTS:-4}" \
        --multi-token "${MULTI_TOKEN:-2}" || handle_error "Base model training failed"
}
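
# Every hyperparameter above can be overridden via the environment. Example
# of a quick smoke-test run (values are illustrative, not tuned):
#   BATCH_SIZE=4 MAX_ITERS=200 EVAL_INTERVAL=50 ./setup.sh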

finetune_lora() {
    while true; do
        read -r -p "Do you want to perform LoRA finetuning? (y/n) " do_finetune
        case $do_finetune in
            [Yy]* )
                print_status "Starting LoRA finetuning..."
                cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

                # Generate a helper that wraps the base checkpoint with LoRA
                # adapters via PEFT
                cat > finetune_lora.py << 'EOF'
import os
import sys

import torch

sys.path.append('src')

# DeepSeekConfig must be importable so the pickled config stored in the
# checkpoint can be restored
from model.deepseek import DeepSeek, DeepSeekConfig
from peft import get_peft_model, LoraConfig, TaskType

def main():
    print("Loading base model...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # weights_only=False is needed on recent PyTorch to unpickle the
    # config object saved alongside the weights
    checkpoint = torch.load('checkpoints/best_model.pt', map_location=device, weights_only=False)
    model = DeepSeek(checkpoint['config'])
    model.load_state_dict(checkpoint['model'])

    # Low-rank adapters on the attention projections only
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_a_proj", "q_b_proj", "kv_a_proj", "kv_b_proj"]
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("LoRA finetuning setup complete!")

if __name__ == "__main__":
    main()
EOF

                python3 finetune_lora.py || handle_error "LoRA finetuning failed"
                break
                ;;
            [Nn]* )
                print_status "Skipping LoRA finetuning..."
                break
                ;;
            * )
                echo "Please answer 'y' or 'n'"
                ;;
        esac
    done
}

test_model() {
    while true; do
        read -r -p "Do you want to test the trained model? (y/n) " do_test
        case $do_test in
            [Yy]* )
                print_status "Testing the trained model..."
                cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

                # Sample a few story openings to sanity-check generation
                prompts=(
                    "Once upon a time"
                    "In a magical forest"
                    "The little robot"
                    "The brave knight"
                )

                for prompt in "${prompts[@]}"; do
                    print_status "Testing with prompt: '$prompt'"
                    python3 src/generate.py \
                        --model-path "${CHECKPOINT_DIR}/best_model.pt" \
                        --prompt "$prompt" \
                        --max-tokens 100 \
                        --temperature 0.8 \
                        --top-k 40
                    echo
                done
                break
                ;;
            [Nn]* )
                print_status "Skipping model testing..."
                break
                ;;
            * )
                echo "Please answer 'y' or 'n'"
                ;;
        esac
    done
}

show_usage() {
    print_info "DeepSeek Children's Stories Model Setup Complete!"
    print_info ""
    print_info "Next steps:"
    print_info "1. Activate virtual environment: source venv/bin/activate"
    print_info "2. Train the model: python src/run_training.py"
    print_info "3. Generate stories: python src/generate.py --prompt 'your prompt'"
    print_info "4. Interactive mode: python src/generate.py --interactive"
    print_info ""
    print_info "Model files:"
    print_info "- Base model: checkpoints/best_model.pt"
    print_info "- LoRA model: lora_checkpoints/best_lora_model.pt"
    print_info ""
    print_info "Configuration options:"
    print_info "- Adjust model size: --n-layer, --n-head, --n-embd"
    print_info "- Training parameters: --batch-size, --learning-rate, --max-iters"
    print_info "- Advanced features: --moe-experts, --multi-token"
}

main() {
    print_info "DeepSeek Children's Stories Model Setup"
    print_info "======================================"

    # Preflight checks
    if ! command_exists python3; then
        handle_error "Python 3 is required but not installed"
    fi

    # pip may only be installed as a module (or as pip3), so probe it
    # through the interpreter
    if ! python3 -m pip --version &> /dev/null; then
        handle_error "pip is required but not installed"
    fi

    if ! check_disk_space; then
        print_warning "Continuing with low disk space..."
    fi

    check_gpu_memory

    create_project_structure
    setup_virtual_env
    prepare_dataset
    train_base_model
    finetune_lora
    test_model

    show_usage
    print_status "Setup completed successfully!"
}

main "$@"