import json
import os
from typing import Dict, List, Optional, Tuple
import torch
from .model import BitTransformerLM
from .training import train_loop


def collapse_submodel(
    cluster_data: List[List[int]],
    target_params: Dict,
    floors: Optional[Dict[str, float]] = None,
    max_rounds: int = 3,
    width_scale: float = 1.5,
    forward_kwargs: Optional[Dict] = None,
) -> Tuple[BitTransformerLM, Dict[str, float]]:
    """Distill a submodel from clustered bit sequences.

    The routine deepens the target model when telemetry floors are unmet and,
    after the first deepening fails, widens the hidden dimensions by
    ``width_scale`` once before retrying. Returns the distilled model and its
    final telemetry metrics.
    """
    if floors is None:
        floors = {"negentropy": 0.5, "lz_complexity": 0.3, "symbiosis_score": 0.5}
    bit_tensor = torch.tensor(cluster_data, dtype=torch.long)
    n = len(bit_tensor)
    # Hold out the last 20% of sequences for validation; fall back to the
    # training split when the cluster is too small to spare any.
    split = max(1, int(0.8 * n))
    train_bits = bit_tensor[:split]
    val_bits = bit_tensor[split:]
    if len(val_bits) == 0:
        val_bits = train_bits
    params = target_params.copy()
    metrics: Dict[str, float] = {}
    width_scaled = False
    for round_idx in range(max_rounds):
        model = BitTransformerLM(**params)
        train_loop(
            model,
            train_bits,
            epochs=2,
            compress_prob=0.5,
            direct_prob=0.0,
            log=False,
            forward_kwargs=forward_kwargs,
        )
        with torch.no_grad():
            logits, telemetry = model(val_bits, **(forward_kwargs or {}))
        neg_k = model.negentropy_logits(logits).mean().item()
        lz_c = model.lz_complexity_logits(logits).mean().item()
        sym_s = telemetry["symbiosis_score"].mean().item()
        metrics = {
            "negentropy": neg_k,
            "lz_complexity": lz_c,
            "symbiosis_score": sym_s,
        }
        # Stop as soon as every telemetry metric clears its floor.
        if (
            neg_k >= floors["negentropy"]
            and lz_c >= floors["lz_complexity"]
            and sym_s >= floors["symbiosis_score"]
        ):
            break
        # Escalation schedule: deepen first, widen once, then keep deepening.
        if round_idx == 0:
            params["num_layers"] = max(1, params.get("num_layers", 1)) + 1
        elif not width_scaled:
            params["d_model"] = int(params.get("d_model", 32) * width_scale)
            params["dim_feedforward"] = int(
                params.get("dim_feedforward", 64) * width_scale
            )
            width_scaled = True
        else:
            params["num_layers"] = max(1, params.get("num_layers", 1)) + 1
    return model, metrics
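

# --- Usage sketch (illustrative only, not part of the library API) ----------
# A minimal, hedged example of driving ``collapse_submodel`` on toy data. The
# cluster contents and the constructor keys in ``target`` are assumptions made
# for this sketch: only ``num_layers``, ``d_model`` and ``dim_feedforward`` are
# the keys the routine above actually adjusts, and ``BitTransformerLM`` may
# require additional arguments in practice.
def _demo_collapse_submodel() -> Tuple[BitTransformerLM, Dict[str, float]]:
    # Toy cluster of equal-length bit sequences (equal length is required so
    # ``torch.tensor`` can stack them into a 2-D tensor).
    cluster = [
        [0, 1] * 8,
        [1, 0] * 8,
        [1, 1, 0, 0] * 4,
        [0, 0, 1, 1] * 4,
    ]
    target = {"num_layers": 1, "d_model": 32, "dim_feedforward": 64}
    return collapse_submodel(cluster, target, max_rounds=2)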


def save_distilled_model(
    model: BitTransformerLM,
    path: str,
    metrics: Dict[str, float],
    floors: Optional[Dict[str, float]] = None,
) -> None:
    """Serialize a distilled model and its metric summary to disk.

    Weights are written to ``path`` and a ``metrics.json`` file is placed in the
    same directory containing the achieved metrics alongside the target floors.
    """
    torch.save(model.state_dict(), path)
    payload = {"metrics": metrics, "floors": floors or {}}
    metrics_path = os.path.join(os.path.dirname(path), "metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(payload, f)
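

# --- Persistence sketch (illustrative only, not part of the library API) ----
# A hedged end-to-end flow: distill a toy submodel and persist it alongside a
# ``metrics.json`` summary. The output directory and file name are placeholder
# choices for this sketch, not paths mandated by BitTransformerLM.
if __name__ == "__main__":
    demo_model, demo_metrics = _demo_collapse_submodel()
    out_dir = "distilled"
    os.makedirs(out_dir, exist_ok=True)
    # Record the default floors applied by ``collapse_submodel`` so the JSON
    # summary shows what the metrics were measured against.
    default_floors = {"negentropy": 0.5, "lz_complexity": 0.3, "symbiosis_score": 0.5}
    save_distilled_model(
        demo_model,
        os.path.join(out_dir, "submodel.pt"),
        demo_metrics,
        floors=default_floors,
    )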