BitTransformerLM / single_gpu_massive.py
#!/usr/bin/env python3
"""
BitTransformerLM Single GPU 680M Parameter Training
===================================================
PROOF OF CONCEPT: 680M parameter model on single GPU to validate everything works!
"""
import time
import logging

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from bit_transformer.model import BitTransformerLM
from bit_transformer.utils import set_dropout

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)


def main():
    """Single GPU 680M parameter training - PROOF IT WORKS!"""
    logger.info("🚀 SINGLE GPU 680M PARAMETER BITTRANSFORMERLM PROOF OF CONCEPT!")
    logger.info("=" * 70)

    # Model configuration (same ~680M-parameter setup as before)
    config = {
        "d_model": 1536,
        "nhead": 24,
        "num_layers": 24,
        "dim_feedforward": 6144,
        "max_seq_len": 2048,
        "lambda_K": 1.0,
        "lambda_C": 1.0,
        "lambda_S": 1.0,
        "reversible": True,
        "use_checkpoint": True,
        "use_autocast": True,
        "chunk_size": None,
        "full_attn_logging": False,
    }
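    # Rough size check for this configuration, assuming a standard transformer
    # layer layout (attention ~4*d_model^2 params, FFN ~2*d_model*dim_feedforward)
    # and ignoring embeddings, norms, and biases:
    #   per layer ≈ 4*1536^2 + 2*1536*6144 ≈ 28.3M
    #   24 layers * 28.3M ≈ 680M parameters, matching the target size.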
    # Create model
    logger.info("🏗️ Creating 680M parameter model...")
    model = BitTransformerLM(**config)
    params = sum(p.numel() for p in model.parameters())
    logger.info(f"✅ Model created: {params:,} parameters ({params/1e6:.1f}M)")

    # Move to GPU
    device = torch.device('cuda:0')
    model = model.to(device)
    logger.info(f"✅ Model moved to {device}")
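    # Ballpark memory estimate (an assumption, not a measurement): ~680M fp32
    # weights ≈ 2.7 GB, plus roughly 5.4 GB for AdamW's two fp32 state tensors,
    # before activations. Reversible layers and activation checkpointing
    # (enabled in the config above) are the usual levers that keep activation
    # memory manageable at this size.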
    # Simple dataset
    logger.info("📚 Creating simple dataset...")

    class SimpleDataset(torch.utils.data.Dataset):
        def __init__(self, num_samples=100):
            self.num_samples = num_samples
            self.seq_len = 2048

        def __len__(self):
            return self.num_samples

        def __getitem__(self, idx):
            # Create simple alternating bit patterns
            pattern = [0, 1, 1, 0] * (self.seq_len // 4)
            if len(pattern) > self.seq_len:
                pattern = pattern[:self.seq_len]
            elif len(pattern) < self.seq_len:
                pattern.extend([0] * (self.seq_len - len(pattern)))
            input_bits = torch.tensor(pattern[:-1], dtype=torch.long)
            target_bits = torch.tensor(pattern[1:], dtype=torch.long)
            return input_bits, target_bits

    dataset = SimpleDataset(100)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    logger.info(f"✅ Dataset created: {len(dataset)} samples")
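    # Optional sanity check: each sample is an (input, target) pair of length
    # seq_len - 1 = 2047, shifted by one bit for next-bit prediction.
    sample_in, sample_tgt = dataset[0]
    assert sample_in.shape == sample_tgt.shape == (2047,)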
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    scaler = torch.amp.GradScaler('cuda')
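    # GradScaler scales the loss before backward so fp16 gradients produced
    # under autocast do not underflow; gradients are unscaled again before
    # clipping and the optimizer step (see the loop below).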
    logger.info("🎯 Starting training...")
    model.train()
    set_dropout(model, 0.1)

    start_time = time.time()
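    # Smoke-test loop: 50 steps of next-bit prediction under autocast. The K/C/S
    # values logged below are read from the model's telemetry dict via its
    # 'negentropy', 'lz_complexity', and 'symbiosis' keys.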
    for step, (input_ids, labels) in enumerate(dataloader):
        if step >= 50:  # Just prove it works for 50 steps
            break

        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass with mixed precision
        with torch.amp.autocast('cuda'):
            outputs = model(input_ids)
            if isinstance(outputs, tuple):
                logits, telemetry = outputs
            else:
                logits = outputs
                telemetry = {}
            loss = F.cross_entropy(logits.view(-1, 2), labels.view(-1))
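        # The scaled-loss backward below produces scaled gradients, so
        # scaler.unscale_() must run before clip_grad_norm_ for the 1.0
        # threshold to apply to the true (unscaled) gradient norm.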
        # Backward pass
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        if step % 10 == 0:
            elapsed = time.time() - start_time
            memory_used = torch.cuda.memory_allocated(0) / (1024**3)
            logger.info(
                f"Step {step:2d} | "
                f"Loss: {loss.item():.4f} | "
                f"K: {telemetry.get('negentropy', 0):.3f} | "
                f"C: {telemetry.get('lz_complexity', 0):.3f} | "
                f"S: {telemetry.get('symbiosis', 0):.3f} | "
                f"Mem: {memory_used:.1f}GB | "
                f"Time: {elapsed:.1f}s"
            )
            start_time = time.time()
    logger.info("🏆 SUCCESS! 680M parameter BitTransformerLM trained successfully!")
    logger.info("✅ Single GPU training PROVEN!")
    logger.info("✅ Ready for proper multi-GPU scaling!")


if __name__ == "__main__":
    main()
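
# To run this proof of concept (assumes a single CUDA-capable GPU with enough
# memory for the ~680M-parameter model and the bit_transformer package on the
# import path):
#   python single_gpu_massive.py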