import json
import os
from typing import Dict, List, Optional, Tuple

import torch

from .model import BitTransformerLM
from .training import train_loop


def collapse_submodel(
    cluster_data: List[List[int]],
    target_params: Dict,
    floors: Optional[Dict[str, float]] = None,
    max_rounds: int = 3,
    width_scale: float = 1.5,
    forward_kwargs: Optional[Dict] = None,
) -> Tuple[BitTransformerLM, Dict[str, float]]:
"""Distill a submodel from clustered bit sequences. |
|
|
|
The routine deepens the target model when telemetry floors are unmet and, |
|
after the first deepening fails, widens the hidden dimensions by |
|
``width_scale`` once before retrying. Returns the distilled model and its |
|
final telemetry metrics. |
|
""" |
|
    if floors is None:
        floors = {"negentropy": 0.5, "lz_complexity": 0.3, "symbiosis_score": 0.5}

    # Hold out the last 20% of sequences for validation; tiny clusters fall
    # back to validating on the training bits themselves.
    bit_tensor = torch.tensor(cluster_data, dtype=torch.long)
    n = len(bit_tensor)
    split = max(1, int(0.8 * n))
    train_bits = bit_tensor[:split]
    val_bits = bit_tensor[split:]
    if len(val_bits) == 0:
        val_bits = train_bits

    params = target_params.copy()
    metrics: Dict[str, float] = {}
    width_scaled = False
    for round_idx in range(max_rounds):
        model = BitTransformerLM(**params)
        train_loop(
            model,
            train_bits,
            epochs=2,
            compress_prob=0.5,
            direct_prob=0.0,
            log=False,
            forward_kwargs=forward_kwargs,
        )
        # Evaluate telemetry on the held-out bits without tracking gradients.
        with torch.no_grad():
            logits, telemetry = model(val_bits, **(forward_kwargs or {}))
            neg_k = model.negentropy_logits(logits).mean().item()
            lz_c = model.lz_complexity_logits(logits).mean().item()
            sym_s = telemetry["symbiosis_score"].mean().item()
        metrics = {
            "negentropy": neg_k,
            "lz_complexity": lz_c,
            "symbiosis_score": sym_s,
        }
        # Stop as soon as every telemetry floor is satisfied.
        if (
            neg_k >= floors["negentropy"]
            and lz_c >= floors["lz_complexity"]
            and sym_s >= floors["symbiosis_score"]
        ):
            break
        # Escalate capacity for the next round: deepen first, widen once by
        # ``width_scale``, then keep deepening on any further failure.
        if round_idx == 0:
            params["num_layers"] = max(1, params.get("num_layers", 1)) + 1
        elif not width_scaled:
            params["d_model"] = int(params.get("d_model", 32) * width_scale)
            params["dim_feedforward"] = int(
                params.get("dim_feedforward", 64) * width_scale
            )
            width_scaled = True
        else:
            params["num_layers"] = max(1, params.get("num_layers", 1)) + 1
    return model, metrics
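

# Usage sketch (illustrative only): the constructor keys below mirror the
# params.get() defaults used above; real clusters and parameter values will
# differ.
#
#     cluster_bits = [[0, 1, 1, 0, 1, 0, 0, 1] * 8 for _ in range(32)]
#     model, metrics = collapse_submodel(
#         cluster_bits,
#         target_params={"d_model": 32, "dim_feedforward": 64, "num_layers": 1},
#     )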


def save_distilled_model(
    model: BitTransformerLM,
    path: str,
    metrics: Dict[str, float],
    floors: Optional[Dict[str, float]] = None,
) -> None:
    """Serialize a distilled model and its metric summary to disk.

    The model weights are written to ``path``, and a ``metrics.json`` file
    containing the achieved metrics alongside the target floors is written to
    the same directory.
    """
    torch.save(model.state_dict(), path)
    payload = {"metrics": metrics, "floors": floors or {}}
    metrics_path = os.path.join(os.path.dirname(path), "metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(payload, f)
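

# Usage sketch (hypothetical path, continuing the example above): the target
# directory is assumed to exist.
#
#     save_distilled_model(model, "distilled/model.pt", metrics)
#     # -> writes distilled/model.pt and distilled/metrics.json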