#!/bin/bash

# Setup script for the DeepSeek Children's Stories model: runs preflight
# checks, builds the environment, prepares data, trains the base model,
# and optionally performs LoRA finetuning and a quick generation test.

# Terminal colors for status output
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration (every value can be overridden via the environment)
PROJECT_ROOT="${PROJECT_ROOT:-$(pwd)}"
VENV_PATH="${VENV_PATH:-${PROJECT_ROOT}/venv}"
CHECKPOINT_DIR="${CHECKPOINT_DIR:-${PROJECT_ROOT}/checkpoints}"
LORA_CHECKPOINT_DIR="${LORA_CHECKPOINT_DIR:-${PROJECT_ROOT}/lora_checkpoints}"
REQUIRED_SPACE_MB="${REQUIRED_SPACE_MB:-2000}"
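
# Example: running against a custom location (paths are illustrative):
#   PROJECT_ROOT=/data/deepseek VENV_PATH=/data/deepseek/.venv ./setup.sh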

print_status() {
    echo -e "${GREEN}[+] $1${NC}"
}

print_error() {
    echo -e "${RED}[-] $1${NC}"
}

print_warning() {
    echo -e "${YELLOW}[!] $1${NC}"
}

print_info() {
    echo -e "${BLUE}[i] $1${NC}"
}

handle_error() {
    print_error "$1"
    exit 1
}

command_exists() {
    command -v "$1" &> /dev/null
}

check_disk_space() {
    # Check the project filesystem rather than whatever directory the
    # script happens to be invoked from
    local available_space_mb
    available_space_mb=$(df -m "${PROJECT_ROOT}" | awk 'NR==2 {print $4}')
    if [ "$available_space_mb" -lt "$REQUIRED_SPACE_MB" ]; then
        print_warning "Low disk space. Only ${available_space_mb}MB available, ${REQUIRED_SPACE_MB}MB required."
        return 1
    fi
    return 0
}
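
# For larger datasets, the threshold can be raised, e.g.:
#   REQUIRED_SPACE_MB=10000 ./setup.sh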

check_gpu_memory() {
    if command_exists nvidia-smi; then
        # nvidia-smi prints one line per device; only the first GPU is
        # inspected here
        local total_memory free_memory used_memory
        total_memory=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1)
        free_memory=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits | head -n1)
        used_memory=$((total_memory - free_memory))
        print_status "GPU Memory: ${used_memory}MB used, ${free_memory}MB free of ${total_memory}MB total"

        if [ "$free_memory" -lt 4000 ]; then
            print_warning "Low GPU memory. Consider reducing batch size or model size."
        fi
    else
        print_warning "nvidia-smi not found. GPU training may not be available."
    fi
}
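
# To pin the run to one device on a multi-GPU host, CUDA_VISIBLE_DEVICES can
# be set before invoking this script, e.g.:
#   CUDA_VISIBLE_DEVICES=0 ./setup.sh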

create_project_structure() {
    print_status "Creating project structure..."
    mkdir -p "${PROJECT_ROOT}/src/data" \
             "${PROJECT_ROOT}/src/model" \
             "${PROJECT_ROOT}/src/training" \
             "${PROJECT_ROOT}/src/inference" \
             "${CHECKPOINT_DIR}" \
             "${LORA_CHECKPOINT_DIR}" || handle_error "Failed to create directories"
}

setup_virtual_env() {
    print_status "Creating virtual environment..."
    python3 -m venv "${VENV_PATH}" || handle_error "Failed to create virtual environment"
    source "${VENV_PATH}/bin/activate" || handle_error "Failed to activate virtual environment"

    print_status "Installing dependencies..."
    pip install --upgrade pip || handle_error "Failed to upgrade pip"
    pip install -r "${PROJECT_ROOT}/requirements.txt" || handle_error "Failed to install requirements"
}
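
# A requirements.txt is expected at the project root. Its exact contents are
# project-specific, but at minimum it must cover what the generated helpers
# below import:
#   torch
#   peft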

prepare_dataset() {
    print_status "Preparing dataset..."
    cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

    # Generate a small driver around the project's data processor
    cat > process_data.py << 'EOF'
import os
import sys

# Make the project's src/ tree importable
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

from data.data_processor import DeepSeekDataProcessor

def main():
    print("[+] Processing dataset into binary files...")
    processor = DeepSeekDataProcessor()
    processor.prepare_dataset()
    print("[+] Data processing completed successfully!")

if __name__ == "__main__":
    main()
EOF

    python3 process_data.py || handle_error "Failed to process dataset"

    # Training cannot proceed without the tokenized train/validation splits
    if [ ! -f "${PROJECT_ROOT}/src/data/train.bin" ] || [ ! -f "${PROJECT_ROOT}/src/data/validation.bin" ]; then
        handle_error "Data processing failed - required files not created"
    fi
}

train_base_model() {
    print_status "Starting DeepSeek base model training..."
    cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

    python3 src/run_training.py \
        --batch-size "${BATCH_SIZE:-12}" \
        --max-iters "${MAX_ITERS:-20000}" \
        --eval-interval "${EVAL_INTERVAL:-1000}" \
        --eval-iters "${EVAL_ITERS:-200}" \
        --learning-rate "${LEARNING_RATE:-6e-4}" \
        --weight-decay "${WEIGHT_DECAY:-0.1}" \
        --warmup-iters "${WARMUP_ITERS:-2000}" \
        --lr-decay-iters "${LR_DECAY_ITERS:-20000}" \
        --min-lr "${MIN_LR:-6e-5}" \
        --moe-experts "${MOE_EXPERTS:-4}" \
        --multi-token "${MULTI_TOKEN:-2}" || handle_error "Base model training failed"
}
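
# Every hyperparameter above can be overridden via the environment. Example
# of a quick smoke-test run (values are illustrative, not tuned):
#   BATCH_SIZE=4 MAX_ITERS=200 EVAL_INTERVAL=50 ./setup.sh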

finetune_lora() {
    while true; do
        read -r -p "Do you want to perform LoRA finetuning? (y/n) " do_finetune
        case $do_finetune in
            [Yy]* )
                print_status "Starting LoRA finetuning..."
                cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

                # Generate a helper that wraps the base checkpoint with LoRA
                # adapters via PEFT
                cat > finetune_lora.py << 'EOF'
import os
import sys

import torch

sys.path.append('src')

# DeepSeekConfig must be importable so the pickled config stored in the
# checkpoint can be restored
from model.deepseek import DeepSeek, DeepSeekConfig
from peft import get_peft_model, LoraConfig, TaskType

def main():
    print("Loading base model...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # weights_only=False is needed on recent PyTorch to unpickle the
    # config object saved alongside the weights
    checkpoint = torch.load('checkpoints/best_model.pt', map_location=device, weights_only=False)
    model = DeepSeek(checkpoint['config'])
    model.load_state_dict(checkpoint['model'])

    # Low-rank adapters on the attention projections only
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_a_proj", "q_b_proj", "kv_a_proj", "kv_b_proj"]
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("LoRA finetuning setup complete!")

if __name__ == "__main__":
    main()
EOF

                python3 finetune_lora.py || handle_error "LoRA finetuning failed"
                break
                ;;
            [Nn]* )
                print_status "Skipping LoRA finetuning..."
                break
                ;;
            * )
                echo "Please answer 'y' or 'n'"
                ;;
        esac
    done
}

test_model() {
    while true; do
        read -r -p "Do you want to test the trained model? (y/n) " do_test
        case $do_test in
            [Yy]* )
                print_status "Testing the trained model..."
                cd "${PROJECT_ROOT}" || handle_error "Failed to change to project directory"

                # Sample a few story openings to sanity-check generation
                prompts=(
                    "Once upon a time"
                    "In a magical forest"
                    "The little robot"
                    "The brave knight"
                )

                for prompt in "${prompts[@]}"; do
                    print_status "Testing with prompt: '$prompt'"
                    python3 src/generate.py \
                        --model-path "${CHECKPOINT_DIR}/best_model.pt" \
                        --prompt "$prompt" \
                        --max-tokens 100 \
                        --temperature 0.8 \
                        --top-k 40
                    echo
                done
                break
                ;;
            [Nn]* )
                print_status "Skipping model testing..."
                break
                ;;
            * )
                echo "Please answer 'y' or 'n'"
                ;;
        esac
    done
}

show_usage() {
    print_info "DeepSeek Children's Stories Model Setup Complete!"
    print_info ""
    print_info "Next steps:"
    print_info "1. Activate virtual environment: source venv/bin/activate"
    print_info "2. Train the model: python src/run_training.py"
    print_info "3. Generate stories: python src/generate.py --prompt 'your prompt'"
    print_info "4. Interactive mode: python src/generate.py --interactive"
    print_info ""
    print_info "Model files:"
    print_info "- Base model: checkpoints/best_model.pt"
    print_info "- LoRA model: lora_checkpoints/best_lora_model.pt"
    print_info ""
    print_info "Configuration options:"
    print_info "- Adjust model size: --n-layer, --n-head, --n-embd"
    print_info "- Training parameters: --batch-size, --learning-rate, --max-iters"
    print_info "- Advanced features: --moe-experts, --multi-token"
}

main() {
    print_info "DeepSeek Children's Stories Model Setup"
    print_info "======================================"

    # Preflight checks
    if ! command_exists python3; then
        handle_error "Python 3 is required but not installed"
    fi

    # pip may only be installed as a module (or as pip3), so probe it
    # through the interpreter
    if ! python3 -m pip --version &> /dev/null; then
        handle_error "pip is required but not installed"
    fi

    if ! check_disk_space; then
        print_warning "Continuing with low disk space..."
    fi

    check_gpu_memory

    create_project_structure
    setup_virtual_env
    prepare_dataset
    train_base_model
    finetune_lora
    test_model

    show_usage
    print_status "Setup completed successfully!"
}

main "$@"