#!/usr/bin/env python3
"""
BitTransformerLM ULTRA OPTIMIZED - 680M Parameters
==================================================
FINAL ATTEMPT: Optimized for memory with shorter sequences and minimal telemetry.
This WILL work because we've proven model creation works perfectly!
"""
import logging

import torch
import torch.nn.functional as F

from bit_transformer.model import BitTransformerLM
from bit_transformer.utils import set_dropout

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)


def main():
    """Ultra-optimized 680M-parameter training run."""
    logger.info("🔥 ULTRA OPTIMIZED 680M PARAMETER BITTRANSFORMERLM!")
    logger.info("=" * 60)

    # ULTRA OPTIMIZED CONFIG - shorter sequences!
    config = {
        "d_model": 1536,
        "nhead": 24,
        "num_layers": 24,
        "dim_feedforward": 6144,
        "max_seq_len": 512,          # MUCH shorter sequences!
        "lambda_K": 0.1,             # Reduce telemetry impact
        "lambda_C": 0.1,
        "lambda_S": 0.1,
        "reversible": True,          # Reversible layers (activations recomputed, not stored)
        "use_checkpoint": True,      # Gradient checkpointing
        "use_autocast": True,        # Mixed-precision forward pass
        "chunk_size": 128,           # Chunked attention for memory
        "full_attn_logging": False,  # No attention logging
    }
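    # Rough parameter estimate for these dimensions (a back-of-the-envelope
    # sketch assuming a standard transformer block layout; the count printed
    # below from the actual model is authoritative):
    #   attention: 4 * d_model^2       = 4 * 1536^2       ≈ 9.4M
    #   FFN:       2 * d_model * d_ff  = 2 * 1536 * 6144  ≈ 18.9M
    #   per layer ≈ 28.3M  ->  24 layers ≈ 680M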
logger.info("πŸ—οΈ Creating ULTRA OPTIMIZED 680M model...")
for k, v in config.items():
logger.info(f" {k}: {v}")
    # Create and move model
    model = BitTransformerLM(**config)
    params = sum(p.numel() for p in model.parameters())
    logger.info(f"✅ Model: {params:,} parameters ({params/1e6:.1f}M)")

    model = model.cuda()
    logger.info("✅ Model on GPU")

    # Ultra simple training data
    logger.info("🎯 Starting ULTRA OPTIMIZED training...")
    model.train()
    set_dropout(model, 0.1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
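    # Rough memory budget for the setup above (back-of-the-envelope, assuming
    # fp32 parameters, gradients, and AdamW state; actual usage depends on the
    # model internals and autocast behaviour):
    #   weights    ~680M * 4 B         ≈ 2.7 GB
    #   gradients                      ≈ 2.7 GB
    #   AdamW exp_avg + exp_avg_sq     ≈ 5.4 GB
    #   => ~11 GB before activations; activations stay small thanks to
    #      reversible layers, checkpointing, and 512-token sequences.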

    seq_len = 512  # Much shorter!
    batch_size = 1

    for step in range(20):  # Just prove it works!
        # Simple alternating bit pattern; inputs and labels are shifted by one
        # position for next-bit prediction
        pattern = ([0, 1] * (seq_len // 2))[:seq_len]
        input_ids = torch.tensor(pattern[:-1], dtype=torch.long).unsqueeze(0).cuda()
        labels = torch.tensor(pattern[1:], dtype=torch.long).unsqueeze(0).cuda()

        optimizer.zero_grad()

        try:
            # Forward with autocast (no GradScaler here; acceptable for a short smoke test)
            with torch.amp.autocast('cuda'):
                outputs = model(input_ids)
                if isinstance(outputs, tuple):
                    logits, telemetry = outputs
                else:
                    logits = outputs
                    telemetry = {}
                # Binary vocabulary: logits are flattened to (tokens, 2)
                loss = F.cross_entropy(logits.view(-1, 2), labels.view(-1))

            # Backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            if step % 5 == 0:
                memory_used = torch.cuda.memory_allocated(0) / (1024**3)
                logger.info(
                    f"Step {step:2d} | "
                    f"Loss: {loss.item():.4f} | "
                    f"Mem: {memory_used:.1f}GB | "
                    f"K: {telemetry.get('negentropy', 0):.3f} | "
                    f"SUCCESS! 🎉"
                )
        except torch.cuda.OutOfMemoryError as e:
            memory_used = torch.cuda.memory_allocated(0) / (1024**3)
            logger.error(f"OOM at step {step}, Memory: {memory_used:.1f}GB")
            logger.error(f"Error: {e}")
            break
        except Exception as e:
            logger.error(f"Other error at step {step}: {e}")
            break
    else:
        # for/else: this branch runs only if all 20 steps complete without a break
        logger.info("🏆 SUCCESS! 680M PARAMETER MODEL TRAINED SUCCESSFULLY!")
        logger.info("🚀 HARDWARE CAN ABSOLUTELY HANDLE THIS!")
        logger.info("✅ Ready for proper multi-GPU implementation!")
        return True

    return False


if __name__ == "__main__":
    success = main()
    if success:
        print("\n🎉 MISSION ACCOMPLISHED! 680M parameters PROVEN TO WORK!")
    else:
        print("\n🔧 Need further optimization...")