Molchevsky committed on
Commit
15746d5
·
verified ·
1 Parent(s): f00944d

Upload llama_finetuning.py

Files changed (1)
  1. llama_finetuning.py +419 -0
llama_finetuning.py ADDED
@@ -0,0 +1,419 @@
+ import os
+ import json
+ import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TrainingArguments,
+     Trainer,
+     BitsAndBytesConfig,
+     DataCollatorForLanguageModeling
+ )
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+ from datasets import Dataset
+ import warnings
+ import glob
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore", category=FutureWarning)
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
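+ # Compatibility note: Trainer(processing_class=...) and the newer TrainingArguments fields
+ # used below (torch_empty_cache_steps, eval_do_concat_batches) only exist in recent
+ # transformers releases; on older versions, pass tokenizer= to Trainer and drop those fields.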
+ def load_jsonl_data(data_dir):
+     """Load conversation data from all JSONL files in the specified directory"""
+     conversations = []
+
+     # Find all JSONL files in the directory
+     jsonl_files = glob.glob(os.path.join(data_dir, "*.jsonl"))
+
+     if not jsonl_files:
+         print(f"⚠️ No JSONL files found in {data_dir}")
+         return []
+
+     print(f"Found {len(jsonl_files)} JSONL files:")
+     for file in jsonl_files:
+         print(f" • {os.path.basename(file)}")
+
+     # Load data from each file
+     for file_path in jsonl_files:
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 for line_num, line in enumerate(f, 1):
+                     line = line.strip()
+                     if not line:
+                         continue
+
+                     try:
+                         data = json.loads(line)
+                         if 'messages' in data:
+                             conversations.append(data['messages'])
+                         else:
+                             print(f"⚠️ Skipping line {line_num} in {file_path}: no 'messages' field")
+                     except json.JSONDecodeError as e:
+                         print(f"⚠️ Skipping invalid JSON on line {line_num} in {file_path}: {e}")
+
+         except Exception as e:
+             print(f"❌ Error reading file {file_path}: {e}")
+
+     print(f"Loaded {len(conversations)} conversations from {data_dir}")
+     return conversations
+
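+ # Illustrative input line (assumed shape, inferred from the loader above, not taken from the
+ # actual dataset): each JSONL line holds one conversation under a "messages" key, e.g.
+ # {"messages": [{"role": "system", "content": "You are a helpful assistant."},
+ #               {"role": "user", "content": "What is LoRA?"},
+ #               {"role": "assistant", "content": "A parameter-efficient fine-tuning method."}]}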
+ def format_conversation_for_training(messages):
+     """
+     Format a conversation with system, user, and assistant messages for Llama training
+
+     Args:
+         messages: List of message dictionaries with 'role' and 'content' keys
+
+     Returns:
+         Formatted string ready for training
+     """
+     formatted_parts = ["<|begin_of_text|>"]
+
+     for message in messages:
+         role = message.get('role', '').lower()
+         content = message.get('content', '').strip()
+
+         if not content:
+             continue
+
+         if role == 'system':
+             formatted_parts.append(f"<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>")
+         elif role == 'user':
+             formatted_parts.append(f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>")
+         elif role == 'assistant':
+             formatted_parts.append(f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>")
+         else:
+             print(f"⚠️ Unknown role '{role}', skipping message")
+
+     return "".join(formatted_parts)
+
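+ # Illustrative output (assumed example, not from the dataset): a single user/assistant exchange
+ # is rendered as one unbroken string of the form
+ # <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello!<|eot_id|>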
+ def tokenize_function(examples, tokenizer, max_length=1024):
+     """Tokenize the conversation examples"""
+     # Tokenize inputs
+     tokenized = tokenizer(
+         examples["text"],
+         truncation=True,
+         padding="max_length",
+         max_length=max_length,
+         return_tensors=None  # Don't return tensors here, let the collator handle it
+     )
+
+     # For causal language modeling, labels are the same as input_ids
+     tokenized["labels"] = tokenized["input_ids"].copy()
+
+     return tokenized
+
+ def prepare_dataset(conversations, tokenizer, max_length=1024):
+     """Prepare dataset for training from conversation data"""
+     formatted_texts = []
+
+     print("📝 Processing conversations...")
+     for i, messages in enumerate(conversations):
+         if not messages:
+             print(f"⚠️ Skipping empty conversation {i+1}")
+             continue
+
+         # Validate conversation structure
+         has_system = any(msg.get('role') == 'system' for msg in messages)
+         has_user = any(msg.get('role') == 'user' for msg in messages)
+         has_assistant = any(msg.get('role') == 'assistant' for msg in messages)
+
+         if not (has_user and has_assistant):
+             print(f"⚠️ Skipping conversation {i+1}: missing user or assistant message")
+             continue
+
+         if not has_system:
+             print(f"⚠️ Conversation {i+1} has no system message")
+
+         # Format the conversation
+         formatted_text = format_conversation_for_training(messages)
+
+         if len(formatted_text.strip()) > 0:
+             formatted_texts.append(formatted_text)
+         else:
+             print(f"⚠️ Skipping empty formatted conversation {i+1}")
+
+     if not formatted_texts:
+         raise ValueError("No valid conversations found! Please check your JSONL files.")
+
+     print(f"✅ Successfully processed {len(formatted_texts)} conversations")
+
+     # Show a sample formatted conversation
+     if formatted_texts:
+         print("\n📋 Sample formatted conversation:")
+         print("-" * 80)
+         sample = formatted_texts[0]
+         print(sample[:500] + "..." if len(sample) > 500 else sample)
+         print("-" * 80)
+
+     # Create Hugging Face dataset
+     dataset = Dataset.from_dict({"text": formatted_texts})
+
+     # Tokenize the dataset
+     tokenized_dataset = dataset.map(
+         lambda examples: tokenize_function(examples, tokenizer, max_length),
+         batched=True,
+         remove_columns=dataset.column_names,
+         desc="Tokenizing conversations"
+     )
+
+     return tokenized_dataset
+
+ def setup_model_and_tokenizer(model_path):
+     """Setup model with quantization and tokenizer"""
+
+     # Quantization config for 4-bit training
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16,
+     )
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_path,
+         trust_remote_code=True,
+         padding_side="right"
+     )
+
+     # Add pad token if it doesn't exist
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # Load model with quantization
+     try:
+         # Try to use Flash Attention 2 if available and compatible
+         model = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             quantization_config=bnb_config,
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+             use_cache=False,  # Disable cache for training
+             attn_implementation="flash_attention_2" if torch.cuda.get_device_capability()[0] >= 8 else "eager"
+         )
+         print("✅ Using Flash Attention 2 for better performance!")
+     except Exception as e:
+         print(f"⚠️ Flash Attention 2 not available ({str(e)}), using standard attention")
+         # Fallback to standard attention
+         model = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             quantization_config=bnb_config,
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+             use_cache=False,  # Disable cache for training
+         )
+
+     # Prepare model for k-bit training
+     model = prepare_model_for_kbit_training(model)
+
+     return model, tokenizer
+
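+ # Note: attn_implementation="flash_attention_2" additionally requires the separate flash-attn
+ # package and an Ampere-or-newer GPU (compute capability >= 8, as checked above); the
+ # try/except falls back to standard attention when either requirement is not met.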
+ def setup_lora_config():
+     """Setup LoRA configuration for Llama 3.2"""
+     lora_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         r=16,  # Rank - can be increased for potentially better results
+         lora_alpha=32,  # LoRA scaling parameter
+         lora_dropout=0.1,  # LoRA dropout
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj",
+             "gate_proj",
+             "up_proj",
+             "down_proj"
+         ],
+         bias="none",
+         inference_mode=False,
+     )
+     return lora_config
236
+ def main():
237
+ # Configuration
238
+ MODEL_PATH = "llama-3.2-3b" # Path to your base model directory
239
+ QA_DATA_PATH = "./new_qa_pairs/" # Path to your JSONL data directory
240
+ OUTPUT_DIR = "llama-3.2-3b-finetuned" # Output directory for the fine-tuned model
241
+
242
+ # Check CUDA availability
243
+ if not torch.cuda.is_available():
244
+ print("❌ CUDA is not available. Please check your installation.")
245
+ return
246
+
247
+ print(f"πŸš€ Starting Llama 3.2 Fine-tuning")
248
+ print(f"Using GPU: {torch.cuda.get_device_name()}")
249
+ print(f"CUDA Version: {torch.version.cuda}")
250
+ print(f"PyTorch Version: {torch.__version__}")
251
+
252
+ # Check if data directory exists
253
+ if not os.path.exists(QA_DATA_PATH):
254
+ print(f"❌ Data directory not found: {QA_DATA_PATH}")
255
+ print("Please create the directory and add your JSONL files.")
256
+ return
257
+
258
+ # Load conversation data
259
+ print(f"\nπŸ“š Loading conversation data from {QA_DATA_PATH}...")
260
+ conversations = load_jsonl_data(QA_DATA_PATH)
261
+
262
+ if len(conversations) == 0:
263
+ print("❌ No valid conversations found. Please check your JSONL files.")
264
+ return
265
+
+     # Setup model and tokenizer
+     print(f"\n🧠 Loading model and tokenizer from {MODEL_PATH}...")
+     model, tokenizer = setup_model_and_tokenizer(MODEL_PATH)
+
+     # Prepare dataset
+     print(f"\n🔧 Preparing dataset...")
+     dataset = prepare_dataset(conversations, tokenizer, max_length=1024)  # Increased for system messages
+
+     # Split dataset (90% train, 10% eval)
+     dataset = dataset.train_test_split(test_size=0.1, seed=42)
+     train_dataset = dataset['train']
+     eval_dataset = dataset['test']
+
+     print(f"\n📊 Dataset Statistics:")
+     print(f" • Total conversations: {len(conversations)}")
+     print(f" • Training samples: {len(train_dataset)}")
+     print(f" • Evaluation samples: {len(eval_dataset)}")
+
+     # Setup LoRA
+     print(f"\n🎯 Setting up LoRA...")
+     lora_config = setup_lora_config()
+     model = get_peft_model(model, lora_config)
+     model.print_trainable_parameters()
+
+     # Data collator - handles dynamic padding and label preparation
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=tokenizer,
+         mlm=False,  # We're doing causal language modeling, not masked LM
+         pad_to_multiple_of=8,
+         return_tensors="pt"
+     )
+
+     # Training arguments - updated for latest API
+     training_args = TrainingArguments(
+         output_dir=OUTPUT_DIR,
+         num_train_epochs=3,
+         per_device_train_batch_size=1,  # Small batch size for 8GB GPU
+         per_device_eval_batch_size=1,
+         gradient_accumulation_steps=8,  # Effective batch size = 1 * 8 = 8
+         warmup_steps=100,
+         learning_rate=2e-4,
+         weight_decay=0.01,
+         fp16=False,
+         bf16=True,  # Use bfloat16 for better stability
+         logging_steps=10,
+         eval_steps=100,
+         save_steps=200,
+         eval_strategy="steps",  # Updated parameter name
+         save_strategy="steps",
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         greater_is_better=False,
+         report_to="none",  # Disable wandb/tensorboard logging
+         dataloader_pin_memory=True,
+         remove_unused_columns=False,
+         optim="paged_adamw_8bit",  # Memory-efficient optimizer
+         lr_scheduler_type="cosine",
+         max_grad_norm=1.0,
+         dataloader_num_workers=0,  # Avoid multiprocessing issues
+         group_by_length=False,  # Disable grouping for stability
+         ddp_find_unused_parameters=False,  # For better performance
+         save_total_limit=3,  # Keep only 3 checkpoints
+         prediction_loss_only=False,
+         include_inputs_for_metrics=False,
+         seed=42,
+         data_seed=42,
+         # New parameters in latest version
+         eval_do_concat_batches=False,  # Better for memory
+         torch_empty_cache_steps=50,  # Clear cache every 50 steps
+         gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
+         gradient_checkpointing_kwargs={"use_reentrant": False},  # Use non-reentrant checkpointing (recommended)
+     )
+
+     # Initialize trainer
+     print(f"\n🏃 Initializing trainer...")
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         processing_class=tokenizer,  # Updated parameter name from tokenizer
+         data_collator=data_collator,
+     )
+
+     # Print training info
+     total_steps = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs
+     print(f"\n📈 Training Configuration:")
+     print(f" • Total training steps: {total_steps}")
+     print(f" • Warmup steps: {training_args.warmup_steps}")
+     print(f" • Learning rate: {training_args.learning_rate}")
+     print(f" • Batch size (effective): {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
+     print(f" • Save every: {training_args.save_steps} steps")
+     print(f" • Eval every: {training_args.eval_steps} steps")
+
+     # Start training
+     print(f"\n🚀 Starting training...")
+     print("=" * 60)
+     trainer.train()
+
+     # Save the fine-tuned model
+     print(f"\n💾 Saving model...")
+     trainer.save_model()
+
+     # Save tokenizer separately to ensure compatibility
+     tokenizer.save_pretrained(OUTPUT_DIR)
+
+     print(f"\n✅ Fine-tuning completed!")
+     print(f"📁 Model saved to: {OUTPUT_DIR}")
+
+     # Test the model with a sample conversation
+     print(f"\n🧪 Testing the model with a sample...")
+
+     # Set model to eval mode
+     model.eval()
+
+     # Use first conversation as test
+     if conversations:
+         test_conversation = conversations[0]
+
+         # Extract system message and user question
+         system_msg = next((msg['content'] for msg in test_conversation if msg['role'] == 'system'), "")
+         user_msg = next((msg['content'] for msg in test_conversation if msg['role'] == 'user'), "")
+         expected_response = next((msg['content'] for msg in test_conversation if msg['role'] == 'assistant'), "")
+
+         if system_msg and user_msg:
+             # Format input for testing
+             test_input = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+             # Tokenize and generate
+             inputs = tokenizer(test_input, return_tensors="pt").to(model.device)
+
+             with torch.no_grad():
+                 outputs = model.generate(
+                     **inputs,
+                     max_new_tokens=150,
+                     temperature=0.7,
+                     do_sample=True,
+                     pad_token_id=tokenizer.eos_token_id,
+                     eos_token_id=tokenizer.eos_token_id,
+                     repetition_penalty=1.1,
+                 )
+
+             # Decode only the newly generated tokens (slicing the decoded string by
+             # len(test_input) is unreliable once special tokens are stripped)
+             generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+             generated_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+
+             print(f"\n📋 Test Results:")
+             print(f"System: {system_msg[:100]}{'...' if len(system_msg) > 100 else ''}")
+             print(f"Question: {user_msg}")
+             print(f"Generated: {generated_answer}")
+             print(f"Expected: {expected_response}")
+             print("=" * 60)
+
+ if __name__ == "__main__":
+     main()