"""
Hugging Face Space App for Free H200 Training

This app runs nano-coder training on HF's free H200 GPU (4 minutes daily).
"""
import os
import subprocess
import time
import gradio as gr
from datetime import datetime, timedelta
# Configuration
MAX_TRAINING_TIME = 3.5 * 60  # seconds; stop at 3.5 min to stay safely under the 4-min daily cap
TRAINING_SCRIPT = "hf_free_training.py"       # training entry point run as a subprocess
DATA_PREP_SCRIPT = "prepare_code_dataset.py"  # one-time dataset preparation script
def check_daily_limit():
    """Check whether today's free H200 time has already been used.

    Returns:
        tuple[bool, str]: ``(can_run, message)`` where ``can_run`` is True when
        training is allowed and ``message`` is a human-readable status.
    """
    today = datetime.now().date()
    # The marker file name embeds the date, so a fresh name appears each day.
    limit_file = f"daily_limit_{today}.txt"

    if not os.path.exists(limit_file):
        print(f"Debug: No limit file found for today: {today}")
        return True, "Ready to train!"

    try:
        with open(limit_file, 'r') as f:
            last_run = f.read().strip()
        print(f"Debug: Found limit file with content: '{last_run}' for date: {today}")
        if last_run == str(today):
            return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {last_run})"
        # Stale or unexpected content: allow training. This fixes the original
        # bug where this path fell through and implicitly returned None,
        # breaking the tuple unpack in run_training().
        return True, "Ready to train!"
    except Exception as e:
        print(f"Debug: Error reading limit file: {e}")
        # If the marker is unreadable, err on the side of allowing training.
        return True, "Ready to train! (Limit file error, allowing training)"
def mark_daily_usage():
    """Record that today's free GPU allocation has been consumed."""
    current_date = datetime.now().date()
    marker_path = f"daily_limit_{current_date}.txt"
    # The marker stores the date string; check_daily_limit compares against it.
    with open(marker_path, 'w') as marker:
        marker.write(str(current_date))
    print(f"Debug: Marked daily usage for {current_date}")
def reset_daily_limit():
    """Delete today's usage-marker file (testing helper).

    Returns:
        str: Status message describing whether a marker was removed.
    """
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    if os.path.exists(limit_file):
        os.remove(limit_file)
        # Emoji restored: the original file's "β" was mojibake of "✅".
        return f"✅ Daily limit reset for {today}"
    return f"ℹ️ No limit file found for {today}"
def run_training():
    """Run the free H200 training as a subprocess, streaming its output.

    Enforces the daily usage limit and the MAX_TRAINING_TIME wall clock.

    Returns:
        str: A status report containing the last ~20 lines of the training log,
        or an error/limit message.
    """
    # Check the daily limit first so we never start a run we can't afford.
    can_run, message = check_daily_limit()
    if not can_run:
        return message

    try:
        # Mark usage up front: even a failed run consumes the daily slot,
        # which prevents repeated attempts from burning extra GPU time.
        mark_daily_usage()

        # Prepare the dataset once; the .bin file marks completed preparation.
        if not os.path.exists("data/python-codes-25k/train.bin"):
            print("Preparing dataset...")
            subprocess.run(["python", DATA_PREP_SCRIPT], check=True)

        print("Starting free H200 training...")
        start_time = time.time()

        # HF Spaces injects HF_TOKEN into the environment; pass it through
        # unchanged. (The original conditional re-assignment was a no-op.)
        env = os.environ.copy()

        process = subprocess.Popen(
            ["python", TRAINING_SCRIPT],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,   # merge stderr into the streamed log
            universal_newlines=True,
            env=env,
        )

        output_lines = []
        while True:
            elapsed = time.time() - start_time
            if elapsed > MAX_TRAINING_TIME:
                process.terminate()
                output_lines.append(f"\n⏰ Time limit reached ({elapsed/60:.1f} minutes)")
                break
            # NOTE(review): readline() blocks until output arrives, so a
            # silent child can delay the time-limit check — acceptable here
            # because the training script logs continuously.
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break
            if line:
                output_lines.append(line.strip())
                print(line.strip())

        # Give the child a moment to exit; kill it if terminate() was ignored
        # so this handler can never hang indefinitely.
        try:
            process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

        tail = "\n".join(output_lines[-20:])  # last 20 lines of the log
        if process.returncode == 0:
            return "✅ Training completed successfully!\n\n" + tail
        return "❌ Training failed or was interrupted.\n\n" + tail

    except Exception as e:
        return f"❌ Error during training: {str(e)}"
def check_model_status():
    """Report whether a trained checkpoint exists and how large it is.

    Returns:
        str: Human-readable status including the checkpoint size in MB.
    """
    model_path = "out-nano-coder-free/ckpt.pt"
    if os.path.exists(model_path):
        size_mb = os.path.getsize(model_path) / (1024 * 1024)  # bytes -> MB
        return f"✅ Model found! Size: {size_mb:.1f} MB"
    return "❌ No trained model found. Run training first."
def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate Python code from the trained checkpoint.

    Args:
        prompt: Text the model should continue.
        max_tokens: Maximum number of tokens to sample.
        temperature: Sampling temperature (higher = more random).

    Returns:
        str: The generated code, or an error message if no model exists
        or generation fails.
    """
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "❌ No trained model found. Please run training first."
    try:
        # Imported lazily so the app can start before a model is trained.
        from sample_nano_coder import load_model, load_vocab, generate_code

        model, checkpoint = load_model()
        stoi, itos = load_vocab()
        # 200 is presumably a top-k sampling cutoff — confirm against
        # sample_nano_coder.generate_code's signature.
        completion = generate_code(model, stoi, itos, prompt, max_tokens, temperature, 200)
        return f"Generated code:\n\n{completion}"
    except Exception as e:
        return f"❌ Error generating code: {str(e)}"
# Create Gradio interface. All emoji below were mojibake in the original
# ("π", "β", etc.) and have been restored from their UTF-8 byte patterns.
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# 🚀 Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎯 Training Control")
            train_button = gr.Button("🚀 Start Free H200 Training", variant="primary")
            reset_button = gr.Button("🔄 Reset Daily Limit", variant="secondary")
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)
        with gr.Column():
            gr.Markdown("### 📊 Model Status")
            model_status_button = gr.Button("🔍 Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎨 Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt",
                placeholder="def fibonacci(n):\n ",
                lines=3
            )
            with gr.Row():
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("✨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)

    # Event handlers: each button routes a plain function's return value
    # into the matching output textbox.
    train_button.click(
        fn=run_training,
        outputs=status_text
    )
    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )
    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )
    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )

    gr.Markdown("""
    ### 📋 Instructions
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code

    ### ⚙️ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12)
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)

    ### 💡 Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
    """)

if __name__ == "__main__":
    demo.launch()