# nano-coder-free / app.py
# Uploaded by mlopez6132 via huggingface_hub (commit ee850c7, verified)
"""
Hugging Face Space App for Free H200 Training
This app runs nano-coder training on HF's free H200 GPU (4 minutes daily)
"""
import os
import subprocess
import time
import gradio as gr
from datetime import datetime, timedelta
# --- Configuration ---
MAX_TRAINING_TIME = 3.5 * 60  # seconds; stop at 3.5 min to stay safely under the 4-min daily H200 quota
TRAINING_SCRIPT = "hf_free_training.py"  # training entry point, launched as a subprocess
DATA_PREP_SCRIPT = "prepare_code_dataset.py"  # one-time dataset preparation script
def check_daily_limit():
    """Return (can_run, message) based on today's usage-marker file.

    A file named ``daily_limit_<date>.txt`` containing today's date means
    the free H200 slot has already been used today.
    """
    today = datetime.now().date()
    marker = f"daily_limit_{today}.txt"

    # No marker for today: training is allowed.
    if not os.path.exists(marker):
        print(f"Debug: No limit file found for today: {today}")
        return True, "Ready to train!"

    try:
        with open(marker, 'r') as fh:
            recorded = fh.read().strip()
        print(f"Debug: Found limit file with content: '{recorded}' for date: {today}")
        if recorded == str(today):
            return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {recorded})"
    except Exception as err:
        # Fail open: an unreadable marker should not block training.
        print(f"Debug: Error reading limit file: {err}")
        return True, "Ready to train! (Limit file error, allowing training)"

    # Marker exists but records a different date: allow training.
    return True, "Ready to train!"
def mark_daily_usage():
    """Record today's date in the daily-limit marker file."""
    today = datetime.now().date()
    marker = f"daily_limit_{today}.txt"
    with open(marker, 'w') as fh:
        fh.write(str(today))
    print(f"Debug: Marked daily usage for {today}")
def reset_daily_limit():
    """Delete today's daily-limit marker file (testing helper)."""
    today = datetime.now().date()
    marker = f"daily_limit_{today}.txt"
    # Nothing to remove: report that the marker was absent.
    if not os.path.exists(marker):
        return f"ℹ️ No limit file found for {today}"
    os.remove(marker)
    return f"βœ… Daily limit reset for {today}"
def run_training():
    """Run the free H200 training as a subprocess, streaming its output.

    Enforces the once-per-day limit, prepares the dataset on first run,
    and hard-stops the child at MAX_TRAINING_TIME seconds.

    Returns a human-readable status string (success, failure, or limit
    message) containing the last 20 lines of training output.
    """
    # Check daily limit before doing anything.
    can_run, message = check_daily_limit()
    if not can_run:
        return message
    try:
        # Mark usage up front so a crash below still counts against the
        # quota (prevents repeated burn of GPU time on a broken setup).
        mark_daily_usage()

        # Prepare dataset if not already done.
        if not os.path.exists("data/python-codes-25k/train.bin"):
            print("Preparing dataset...")
            subprocess.run(["python", DATA_PREP_SCRIPT], check=True)

        print("Starting free H200 training...")
        start_time = time.time()

        # HF Spaces normally injects HF_TOKEN; ensure the key exists so the
        # child never KeyErrors on it. (env is a copy of os.environ, so
        # setdefault('') is equivalent to the original lookup-and-assign.)
        env = os.environ.copy()
        env.setdefault('HF_TOKEN', '')

        process = subprocess.Popen(
            ["python", TRAINING_SCRIPT],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so nothing is lost
            universal_newlines=True,
            env=env,
        )

        output_lines = []
        while True:
            elapsed = time.time() - start_time
            if elapsed > MAX_TRAINING_TIME:
                process.terminate()
                output_lines.append(f"\n⏰ Time limit reached ({elapsed/60:.1f} minutes)")
                break
            # NOTE(review): readline() can block past the deadline if the
            # child goes silent; acceptable here because the trainer logs
            # steadily, but a reader thread would be the robust fix.
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break
            if line:
                output_lines.append(line.strip())
                print(line.strip())

        # Wait for the child; escalate to kill() if terminate() is ignored,
        # otherwise this wait could hang the request forever.
        try:
            process.wait(timeout=30)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

        tail = "\n".join(output_lines[-20:])  # last 20 lines of output
        if process.returncode == 0:
            return "βœ… Training completed successfully!\n\n" + tail
        return "❌ Training failed or was interrupted.\n\n" + tail
    except Exception as e:
        return f"❌ Error during training: {str(e)}"
def check_model_status():
    """Report whether a trained checkpoint exists and its size in MB."""
    ckpt = "out-nano-coder-free/ckpt.pt"
    if not os.path.exists(ckpt):
        return "❌ No trained model found. Run training first."
    size_mb = os.path.getsize(ckpt) / (1024 * 1024)
    return f"βœ… Model found! Size: {size_mb:.1f} MB"
def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate a code completion for *prompt* using the trained checkpoint.

    Returns the generated text, or an error message when no checkpoint
    exists or sampling fails.
    """
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "❌ No trained model found. Please run training first."
    try:
        # Import lazily so the app loads even without the sampling module.
        from sample_nano_coder import load_model, load_vocab, generate_code

        model, _checkpoint = load_model()
        stoi, itos = load_vocab()
        completion = generate_code(model, stoi, itos, prompt, max_tokens, temperature, 200)
        return f"Generated code:\n\n{completion}"
    except Exception as exc:
        return f"❌ Error generating code: {str(exc)}"
# Create Gradio interface (built at import time; HF Spaces serves `demo`).
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# πŸš€ Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")
    # Top row: training controls on the left, model status on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎯 Training Control")
            train_button = gr.Button("πŸš€ Start Free H200 Training", variant="primary")
            reset_button = gr.Button("πŸ”„ Reset Daily Limit", variant="secondary")
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)
        with gr.Column():
            gr.Markdown("### πŸ“Š Model Status")
            model_status_button = gr.Button("πŸ” Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)
    # Second row: prompt-driven code generation with sampling controls.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎨 Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt",
                placeholder="def fibonacci(n):\n ",
                lines=3
            )
            with gr.Row():
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("✨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)
    # Event handlers: wire each button to its backing function above.
    train_button.click(
        fn=run_training,
        outputs=status_text
    )
    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )
    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )
    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )
    # Static help text shown below the controls.
    gr.Markdown("""
    ### πŸ“‹ Instructions
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code
    ### βš™οΈ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12)
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)
    ### πŸ’‘ Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
    """)

# Launch only when run directly (HF Spaces imports `demo` instead).
if __name__ == "__main__":
    demo.launch()