🚀 OS Launch: Clean documentation and refined licensing

This OS launch commit includes:

✅ **Cleaned Documentation**
- Removed inflated claims and marketing language
- Added honest research status and limitations
- Created professional model card and validation reports
- Streamlined licensing to AGPLv3 + commercial contact

✅ **Refined Codebase**
- Complete experimental bit-native transformer implementation
- 57 Python files forming a comprehensive research framework
- Safety telemetry and monitoring systems
- Distributed training and development tools

✅ **Professional Standards**
- Empirical validation of all claims
- Clear experimental vs production distinctions
- Rigorous research methodology requirements
- Community contribution framework

Ready for serious research evaluation and academic investigation.

Files changed (1) hide show

integration_schedule.py +379 -0

integration_schedule.py ADDED Viewed

	@@ -0,0 +1,379 @@

+import os
+import time
+import math
+from itertools import cycle
+from typing import Optional
+import torch
+import torch.nn.functional as F
+from bit_transformer import (
+    BitTransformerLM,
+    text_to_bits,
+    quantize_dynamic,
+    prepare_qat_fx,
+    convert_qat_fx,
+    hil_safe_inference,
+    collapse_submodel,
+    diffusion_inference,
+    TelemetrySynthesizer,
+    save_distilled_model,
+)
+from bit_transformer.training import train_loop as train
+from bit_transformer.optimization import configure_optimizer, adjust_learning_rate
+from bit_transformer.utils import save_model, load_model, set_dropout
+from bit_transformer.torch_utils import cpu_autocast
+def lines_to_tensor(lines, max_len):
+    seqs = []
+    for text in lines:
+        bits = text_to_bits(text)[:max_len]
+        if len(bits) < max_len:
+            bits.extend([0] * (max_len - len(bits)))
+        seqs.append(bits)
+    return torch.tensor(seqs, dtype=torch.long)
+def load_wikitext(dataset_size=128, max_len=64):
+    try:
+        from datasets import load_dataset
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
+        train_lines = [t for t in ds["train"]["text"] if t.strip()][:dataset_size]
+        valid_split = max(1, dataset_size // 4)
+        valid_lines = [t for t in ds["validation"]["text"] if t.strip()][:valid_split]
+        train = lines_to_tensor(train_lines, max_len)
+        valid = lines_to_tensor(valid_lines, max_len)
+        return train, valid, train_lines
+    except Exception as e:
+        print("Dataset load failed, using random bits", e)
+        train = torch.randint(0, 2, (dataset_size, max_len), dtype=torch.long)
+        valid = torch.randint(0, 2, (max_len, max_len), dtype=torch.long)
+        return train, valid, ["" for _ in range(len(train))]
+def _warmup(
+    model: BitTransformerLM,
+    data: torch.Tensor,
+    steps: int = 5,
+    freeze_old: bool = False,
+    old_layers: int = 0,
+    *,
+    diffusion: bool = False,
+    curriculum: bool = False,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+) -> None:
+    """Run a short warm-up loop after expansion."""
+    model.train()
+    set_dropout(model, 0.1)
+    if freeze_old:
+        for idx, layer in enumerate(model.layers):
+            if idx < old_layers:
+                for p in layer.parameters():
+                    p.requires_grad_(False)
+    if optimizer is None or scheduler is None:
+        optimizer, scheduler = configure_optimizer(model, lr=1e-3, total_steps=steps)
+    it = iter(data.split(8))
+    for idx in range(steps):
+        try:
+            batch = next(it)
+        except StopIteration:
+            it = iter(data.split(8))
+            batch = next(it)
+        if diffusion:
+            p = 0.5 * (1 - idx / max(1, steps - 1)) if curriculum else 0.5
+            noise = (torch.rand_like(batch.float()) < p).long()
+            noisy = batch ^ noise
+            logits, _ = model(noisy, causal=False)
+            pred = logits.reshape(-1, 2)
+            target = batch.reshape(-1)
+        else:
+            logits, _ = model(batch)
+            pred = logits[:, :-1, :].reshape(-1, 2)
+            target = batch[:, 1:].reshape(-1)
+        loss = F.cross_entropy(pred, target)
+        loss.backward()
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        optimizer.step()
+        scheduler.step()
+        optimizer.zero_grad()
+    for p in model.parameters():
+        p.requires_grad_(True)
+    model.eval()
+    set_dropout(model, 0.0)
+def integration_schedule(
+    steps: int = 10,
+    max_len: int = 64,
+    dataset_size: int = 128,
+    *,
+    weights_path: str = "weights/model.pt.gz",
+    plateau_steps: int = 0,
+    collapsed_path: str | None = None,
+    epochs_per_step: int = 2,
+    extra_steps: int = 3,
+    collapse: bool = True,
+    diffusion: bool = False,
+    noise_schedule: str = "linear",
+    diffusion_steps: int = 8,
+    diffusion_curriculum: bool = False,
+    use_checkpoint: bool = True,
+    reversible: bool = True,
+    improve_thresh: float = 0.01,
+    qat: bool = False,
+):
+    start = time.time()
+    train_bits, valid_bits, train_lines = load_wikitext(dataset_size, max_len)
+    if os.path.exists(weights_path):
+        try:
+            model = load_model(weights_path)
+            print(f"Loaded model from {weights_path}")
+        except Exception as e:
+            print("Failed to load weights, initializing new model", e)
+            model = BitTransformerLM(
+                d_model=32,
+                nhead=4,
+                num_layers=1,
+                dim_feedforward=64,
+                max_seq_len=max_len,
+                use_act=True,
+                act_threshold=0.7,
+                reversible=reversible,
+                chunk_size=max_len,
+                use_autocast=True,
+                use_checkpoint=use_checkpoint,
+            )
+    else:
+        model = BitTransformerLM(
+            d_model=32,
+            nhead=4,
+            num_layers=1,
+            dim_feedforward=64,
+            max_seq_len=max_len,
+            use_act=True,
+            act_threshold=0.7,
+            reversible=reversible,
+            chunk_size=max_len,
+            use_autocast=True,
+            use_checkpoint=use_checkpoint,
+        )
+    if qat:
+        model = prepare_qat_fx(model)
+    results = []
+    scale_cycle = cycle(["layers", "width", "context"])
+    base_lr = 1e-3
+    prev_val_loss: Optional[float] = None
+    for step in range(steps):
+        model.train()
+        set_dropout(model, 0.1)
+        opt, sched = configure_optimizer(
+            model, lr=base_lr, total_steps=epochs_per_step
+        )
+        train(
+            model,
+            train_bits,
+            epochs=epochs_per_step,
+            extra_steps=extra_steps,
+            compress_prob=0.0 if diffusion else 1.0,
+            log=True,
+            diffusion=diffusion,
+            diffusion_curriculum=diffusion_curriculum,
+            optimizer=opt,
+            scheduler=sched,
+        )
+        model.eval()
+        set_dropout(model, 0.0)
+        with torch.no_grad():
+            logits, telemetry = model(valid_bits, causal=not diffusion)
+            if diffusion:
+                pred = logits.reshape(-1, 2)
+                target = valid_bits.reshape(-1)
+            else:
+                pred = logits[:, :-1, :].reshape(-1, 2)
+                target = valid_bits[:, 1:].reshape(-1)
+            val_loss = F.cross_entropy(pred, target).item()
+            k = telemetry["negentropy_logits"].mean().item()
+            c = telemetry["lz_complexity_logits"].mean().item()
+            s = telemetry["symbiosis_score"].mean().item()
+        print(f"Step {step} validation loss: {val_loss:.4f} K={k:.3f} C={c:.3f} S={s:.3f}")
+        results.append((step, val_loss, k, c, s))
+        if prev_val_loss is not None and prev_val_loss - val_loss < improve_thresh:
+            strategy = next(scale_cycle)
+            base_lr = adjust_learning_rate(opt, 1 / math.sqrt(2))
+            if strategy == "layers":
+                old_layers = model.num_layers
+                model = model.double_layers()
+                warm_opt, warm_sched = configure_optimizer(
+                    model, lr=base_lr, total_steps=100
+                )
+                _warmup(
+                    model,
+                    train_bits,
+                    steps=100,
+                    freeze_old=True,
+                    old_layers=old_layers,
+                    diffusion=diffusion,
+                    curriculum=diffusion_curriculum,
+                    optimizer=warm_opt,
+                    scheduler=warm_sched,
+                )
+            elif strategy == "width":
+                model = model.double_width()
+                warm_opt, warm_sched = configure_optimizer(
+                    model, lr=base_lr, total_steps=100
+                )
+                _warmup(
+                    model,
+                    train_bits,
+                    steps=100,
+                    diffusion=diffusion,
+                    curriculum=diffusion_curriculum,
+                    optimizer=warm_opt,
+                    scheduler=warm_sched,
+                )
+            else:
+                max_len *= 2
+                train_bits, valid_bits, train_lines = load_wikitext(
+                    dataset_size, max_len
+                )
+                model = model.double_length()
+                warm_opt, warm_sched = configure_optimizer(
+                    model, lr=base_lr, total_steps=100
+                )
+                _warmup(
+                    model,
+                    train_bits,
+                    steps=100,
+                    diffusion=diffusion,
+                    curriculum=diffusion_curriculum,
+                    optimizer=warm_opt,
+                    scheduler=warm_sched,
+                )
+        prev_val_loss = val_loss
+        if time.time() - start > 8 * 60:
+            print("Time limit reached")
+            break
+    # optional plateau phase at final size
+    for p in range(plateau_steps):
+        model.train()
+        set_dropout(model, 0.1)
+        train(
+            model,
+            train_bits,
+            epochs=epochs_per_step,
+            extra_steps=extra_steps,
+            compress_prob=0.0 if diffusion else 1.0,
+            log=True,
+            diffusion=diffusion,
+            diffusion_curriculum=diffusion_curriculum,
+        )
+        model.eval()
+        set_dropout(model, 0.0)
+        with torch.no_grad():
+            logits, telemetry = model(valid_bits, causal=not diffusion)
+            if diffusion:
+                pred = logits.reshape(-1, 2)
+                target = valid_bits.reshape(-1)
+            else:
+                pred = logits[:, :-1, :].reshape(-1, 2)
+                target = valid_bits[:, 1:].reshape(-1)
+            val_loss = F.cross_entropy(pred, target).item()
+            k = telemetry["negentropy_logits"].mean().item()
+            c = telemetry["lz_complexity_logits"].mean().item()
+            s = telemetry["symbiosis_score"].mean().item()
+        idx = steps + p
+        print(
+            f"Plateau {p} validation loss: {val_loss:.4f} K={k:.3f} C={c:.3f} S={s:.3f}"
+        )
+        results.append((idx, val_loss, k, c, s))
+        if time.time() - start > 8 * 60:
+            print("Time limit reached")
+            break
+    # final validation after last step
+    model.eval()
+    set_dropout(model, 0.0)
+    with torch.no_grad():
+        logits, telemetry = model(valid_bits, causal=not diffusion)
+        if diffusion:
+            pred = logits.reshape(-1, 2)
+            target = valid_bits.reshape(-1)
+        else:
+            pred = logits[:, :-1, :].reshape(-1, 2)
+            target = valid_bits[:, 1:].reshape(-1)
+        val_loss = F.cross_entropy(pred, target).item()
+        k = telemetry["negentropy_logits"].mean().item()
+        c = telemetry["lz_complexity_logits"].mean().item()
+        s = telemetry["symbiosis_score"].mean().item()
+    print(f"Final validation loss: {val_loss:.4f} K={k:.3f} C={c:.3f} S={s:.3f}")
+    results.append((steps + plateau_steps, val_loss, k, c, s))
+    # persist final model weights for future runs
+    save_model(model, weights_path)
+    input_bits = valid_bits[:1]
+    if qat:
+        qmodel = convert_qat_fx(model)
+    else:
+        with cpu_autocast():
+            model(input_bits)
+        qmodel = quantize_dynamic(model)
+    qmodel.eval()
+    try:
+        hil_safe_inference(
+            qmodel,
+            input_bits,
+            c_floor=0.3,
+            s_floor=0.5,
+            causal=not diffusion,
+            strict=not diffusion,
+        )
+    except RuntimeError as e:
+        print("Safety gate triggered", e)
+    collapsed = None
+    if collapse:
+        synth = TelemetrySynthesizer(n_clusters=8)
+        reps = synth.cluster_sequences(model, train_bits[:64])
+        floors = {"negentropy": 0.3, "lz_complexity": 0.35, "symbiosis_score": 0.5}
+        collapsed, metrics = collapse_submodel(
+            reps,
+            target_params=dict(
+                d_model=16,
+                nhead=4,
+                num_layers=1,
+                dim_feedforward=32,
+                max_seq_len=max_len,
+            ),
+            floors=floors,
+        )
+        collapsed.eval()
+        with torch.no_grad():
+            logits, _ = collapsed(valid_bits)
+            pred = logits[:, :-1, :].reshape(-1, 2)
+            target = valid_bits[:, 1:].reshape(-1)
+            c_loss = F.cross_entropy(pred, target).item()
+        print("Collapsed model validation loss:", c_loss)
+        if collapsed_path is not None:
+            save_distilled_model(
+                collapsed,
+                collapsed_path,
+                {**metrics, "val_loss": c_loss},
+                floors=floors,
+            )
+    if diffusion:
+        sample = diffusion_inference(
+            model, length=max_len, steps=diffusion_steps, schedule=noise_schedule
+        )
+        print("Diffusion sample:", sample[0].tolist())
+    return results, collapsed
+if __name__ == "__main__":  # script entry point
+    integration_schedule()  # run the schedule with all default arguments