""" Hugging Face Space App for Free H200 Training This app runs nano-coder training on HF's free H200 GPU (4 minutes daily) """ import os import subprocess import time import gradio as gr from datetime import datetime, timedelta # Configuration MAX_TRAINING_TIME = 3.5 * 60 # 3.5 minutes to be safe TRAINING_SCRIPT = "hf_free_training.py" DATA_PREP_SCRIPT = "prepare_code_dataset.py" def check_daily_limit(): """Check if we've used today's free H200 time.""" today = datetime.now().date() limit_file = f"daily_limit_{today}.txt" # For debugging, let's check what's in the file if os.path.exists(limit_file): try: with open(limit_file, 'r') as f: last_run = f.read().strip() print(f"Debug: Found limit file with content: '{last_run}' for date: {today}") if last_run == str(today): return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {last_run})" except Exception as e: print(f"Debug: Error reading limit file: {e}") # If there's an error reading the file, let's allow training return True, "Ready to train! (Limit file error, allowing training)" else: print(f"Debug: No limit file found for today: {today}") return True, "Ready to train!" def mark_daily_usage(): """Mark that we've used today's free time.""" today = datetime.now().date() limit_file = f"daily_limit_{today}.txt" with open(limit_file, 'w') as f: f.write(str(today)) print(f"Debug: Marked daily usage for {today}") def reset_daily_limit(): """Reset the daily limit (for testing).""" today = datetime.now().date() limit_file = f"daily_limit_{today}.txt" if os.path.exists(limit_file): os.remove(limit_file) return f"✅ Daily limit reset for {today}" else: return f"ℹ️ No limit file found for {today}" def run_training(): """Run the free H200 training.""" # Check daily limit can_run, message = check_daily_limit() if not can_run: return message try: # Mark usage mark_daily_usage() # Prepare dataset if not already done if not os.path.exists("data/python-codes-25k/train.bin"): print("Preparing dataset...") subprocess.run(["python", DATA_PREP_SCRIPT], check=True) # Run training print("Starting free H200 training...") start_time = time.time() # Set environment variables for HF env = os.environ.copy() # HF Spaces automatically provides HF_TOKEN if 'HF_TOKEN' not in env: env['HF_TOKEN'] = os.environ.get('HF_TOKEN', '') # Run training with timeout process = subprocess.Popen( ["python", TRAINING_SCRIPT], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, env=env ) output_lines = [] while True: elapsed = time.time() - start_time if elapsed > MAX_TRAINING_TIME: process.terminate() output_lines.append(f"\n⏰ Time limit reached ({elapsed/60:.1f} minutes)") break line = process.stdout.readline() if not line and process.poll() is not None: break if line: output_lines.append(line.strip()) print(line.strip()) # Wait for process to finish process.wait() # Check if training completed successfully if process.returncode == 0: result = "✅ Training completed successfully!\n\n" + "\n".join(output_lines[-20:]) # Last 20 lines else: result = "❌ Training failed or was interrupted.\n\n" + "\n".join(output_lines[-20:]) return result except Exception as e: return f"❌ Error during training: {str(e)}" def check_model_status(): """Check if trained model exists.""" model_path = "out-nano-coder-free/ckpt.pt" if os.path.exists(model_path): # Get file size size = os.path.getsize(model_path) / (1024 * 1024) # MB return f"✅ Model found! Size: {size:.1f} MB" else: return "❌ No trained model found. Run training first." 
def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate code using the trained model."""
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "❌ No trained model found. Please run training first."

    try:
        # Import and run sampling
        from sample_nano_coder import load_model, load_vocab, generate_code

        model, checkpoint = load_model()
        stoi, itos = load_vocab()

        # Generate code
        completion = generate_code(model, stoi, itos, prompt, max_tokens, temperature, 200)

        return f"Generated code:\n\n{completion}"

    except Exception as e:
        return f"❌ Error generating code: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# 🚀 Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎯 Training Control")
            train_button = gr.Button("🚀 Start Free H200 Training", variant="primary")
            reset_button = gr.Button("🔄 Reset Daily Limit", variant="secondary")
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)

        with gr.Column():
            gr.Markdown("### 📊 Model Status")
            model_status_button = gr.Button("🔍 Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎨 Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt",
                placeholder="def fibonacci(n):\n    ",
                lines=3
            )
            with gr.Row():
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("✨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)

    # Event handlers
    train_button.click(
        fn=run_training,
        outputs=status_text
    )

    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )

    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )

    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )

    gr.Markdown("""
    ### 📋 Instructions
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code

    ### ⚙️ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12)
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)

    ### 💡 Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
    """)


if __name__ == "__main__":
    demo.launch()
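

# --- Optional: querying this Space programmatically -------------------------
# A minimal client-side sketch, not used by the app itself. It assumes the
# `gradio_client` package is installed, uses the placeholder Space id
# "your-username/nano-coder-free", and assumes Gradio's default behavior of
# naming each .click() endpoint after its handler function (so the code
# generation endpoint is assumed to be "/generate_sample_code"); run
# `client.view_api()` to confirm the actual endpoint names.
def example_remote_generation():
    """Hypothetical remote call to the deployed Space (never invoked here)."""
    from gradio_client import Client

    client = Client("your-username/nano-coder-free")  # placeholder Space id
    client.view_api()  # print available endpoints and their signatures
    result = client.predict(
        "def fibonacci(n):\n    ",  # code_prompt
        100,                        # max_tokens
        0.8,                        # temperature
        api_name="/generate_sample_code",  # assumed default endpoint name
    )
    print(result)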