#!/usr/bin/env python3
"""
BitTransformerLM Single GPU 680M Parameter Training
===================================================
PROOF OF CONCEPT: 680M parameter model on single GPU to validate everything works!
"""
import os
import sys
import time
import logging
from datetime import datetime
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset
from bit_transformer.model import BitTransformerLM
from bit_transformer.bit_io import text_to_bits
from bit_transformer.utils import set_dropout
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)


def main():
    """Single GPU 680M parameter training - PROOF IT WORKS!"""
    logger.info("SINGLE GPU 680M PARAMETER BITTRANSFORMERLM PROOF OF CONCEPT!")
    logger.info("=" * 70)
    # Model configuration - SAME AS BEFORE
    config = {
        "d_model": 1536,
        "nhead": 24,
        "num_layers": 24,
        "dim_feedforward": 6144,
        "max_seq_len": 2048,
        "lambda_K": 1.0,
        "lambda_C": 1.0,
        "lambda_S": 1.0,
        "reversible": True,
        "use_checkpoint": True,
        "use_autocast": True,
        "chunk_size": None,
        "full_attn_logging": False,
    }
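    # With d_model=1536, 24 layers, and a 6144-wide feedforward block this works out
    # to roughly 680M parameters. lambda_K/C/S presumably weight the negentropy (K),
    # LZ-complexity (C), and symbiosis (S) telemetry terms; reversible layers plus
    # checkpointing trade extra compute for lower activation memory on a single GPU.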
    # Create model
    logger.info("Creating 680M parameter model...")
    model = BitTransformerLM(**config)
    params = sum(p.numel() for p in model.parameters())
    logger.info(f"Model created: {params:,} parameters ({params/1e6:.1f}M)")

    # Move to GPU
    device = torch.device('cuda:0')
    model = model.to(device)
    logger.info(f"Model moved to {device}")
    # Simple dataset
    logger.info("Creating simple dataset...")

    class SimpleDataset(torch.utils.data.Dataset):
        def __init__(self, num_samples=100):
            self.num_samples = num_samples
            self.seq_len = 2048

        def __len__(self):
            return self.num_samples

        def __getitem__(self, idx):
            # Create simple alternating bit patterns
            pattern = [0, 1, 1, 0] * (self.seq_len // 4)
            if len(pattern) > self.seq_len:
                pattern = pattern[:self.seq_len]
            elif len(pattern) < self.seq_len:
                pattern.extend([0] * (self.seq_len - len(pattern)))
            input_bits = torch.tensor(pattern[:-1], dtype=torch.long)
            target_bits = torch.tensor(pattern[1:], dtype=torch.long)
            return input_bits, target_bits
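
    # Each SimpleDataset item is an (input, target) pair shifted by one position for
    # next-bit prediction; the resulting 2047-bit sequences stay within max_seq_len=2048.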
    dataset = SimpleDataset(100)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    logger.info(f"Dataset created: {len(dataset)} samples")
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    scaler = torch.amp.GradScaler('cuda')
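    # GradScaler scales the loss before backward so fp16 gradients do not underflow;
    # gradients are unscaled again before clipping and the optimizer step below.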
logger.info("π― Starting training...")
model.train()
set_dropout(model, 0.1)
start_time = time.time()
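    # Short smoke test: 50 steps is enough to confirm the 680M model's forward and
    # backward passes run and fit in memory on a single GPU.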
    for step, (input_ids, labels) in enumerate(dataloader):
        if step >= 50:  # Just prove it works for 50 steps
            break

        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass with mixed precision
        with torch.amp.autocast('cuda'):
            outputs = model(input_ids)
            if isinstance(outputs, tuple):
                logits, telemetry = outputs
            else:
                logits = outputs
                telemetry = {}
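            # Bit-level language modeling: next-bit cross-entropy over a two-symbol (0/1) vocabulary.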
            loss = F.cross_entropy(logits.view(-1, 2), labels.view(-1))
        # Backward pass
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
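        # K/C/S in the log line map to the 'negentropy', 'lz_complexity', and 'symbiosis'
        # telemetry entries; .get() defaults to 0 when the model returns logits only.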
        if step % 10 == 0:
            elapsed = time.time() - start_time
            memory_used = torch.cuda.memory_allocated(0) / (1024**3)
            logger.info(
                f"Step {step:2d} | "
                f"Loss: {loss.item():.4f} | "
                f"K: {telemetry.get('negentropy', 0):.3f} | "
                f"C: {telemetry.get('lz_complexity', 0):.3f} | "
                f"S: {telemetry.get('symbiosis', 0):.3f} | "
                f"Mem: {memory_used:.1f}GB | "
                f"Time: {elapsed:.1f}s"
            )
            start_time = time.time()
logger.info("π SUCCESS! 680M parameter BitTransformerLM trained successfully!")
logger.info("β
Single GPU training PROVEN!")
logger.info("β
Ready for proper multi-GPU scaling!")
if __name__ == "__main__":
main() |