# BitTransformerLM / unified_workflow.py
import os
import subprocess
import sys
import time
from bit_transformer.utils import load_model
from bit_transformer.hf_checkpoint import (
hf_login,
save_checkpoint,
download_checkpoint,
)
from bit_transformer import diffusion_inference
from bit_transformer.cli_standards import create_workflow_parser
from integration_schedule import integration_schedule


def _launch_dashboard() -> list[subprocess.Popen]:
"""Start MCP server and dashboard processes."""
    server = subprocess.Popen([sys.executable, "mcp_server.py"])
    time.sleep(2)  # give the MCP server a moment to start listening
    dash_env = dict(os.environ)
    dash_env.setdefault("MCP_SERVER_ADDR", "http://127.0.0.1:7000")
dashboard = subprocess.Popen(
[sys.executable, "-m", "bit_transformer.dashboard_app"],
env=dash_env,
)
return [server, dashboard]
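

# Note: ``setdefault`` above means an exported MCP_SERVER_ADDR overrides the
# built-in default, so the dashboard can target a remote MCP server, e.g.
# (hypothetical address):
#
#     MCP_SERVER_ADDR=http://10.0.0.5:7000 python unified_workflow.py --dashboard
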
def _terminate(procs: list[subprocess.Popen]) -> None:
    """Terminate child processes, force-killing any that ignore SIGTERM."""
    for p in procs:
        p.terminate()
        try:
            p.wait(timeout=5)
        except Exception:
            p.kill()


def run_workflow(
steps: int = 10,
max_len: int = 64,
dataset_size: int = 128,
*,
launch_ui: bool = False,
weights_path: str = "weights/model.pt.gz",
collapsed_path: str = "weights/collapsed.pt.gz",
plateau_steps: int = 0,
epochs_per_step: int = 2,
extra_steps: int = 3,
collapse: bool = True,
hf_repo: str | None = None,
hf_token: str | None = None,
diffusion: bool = False,
noise_schedule: str = "linear",
diffusion_steps: int = 8,
diffusion_curriculum: bool = False,
use_checkpoint: bool = True,
reversible: bool = True,
qat: bool = False,
) -> tuple:
"""Run the full integration schedule with optional dashboard.
If ``qat`` is ``True`` the model undergoes 4-bit quantization-aware training
before being converted to quantized weights for safety checks.
"""
procs: list[subprocess.Popen] = []
if launch_ui:
procs = _launch_dashboard()
    if hf_repo:
        hf_login(token=hf_token)
        # Pull an existing checkpoint from the Hub when no local weights exist.
        if not os.path.exists(weights_path):
            download_checkpoint(weights_path, repo_id=hf_repo)
try:
results, collapsed = integration_schedule(
steps=steps,
max_len=max_len,
dataset_size=dataset_size,
weights_path=weights_path,
plateau_steps=plateau_steps,
collapsed_path=collapsed_path,
epochs_per_step=epochs_per_step,
extra_steps=extra_steps,
collapse=collapse,
diffusion=diffusion,
noise_schedule=noise_schedule,
diffusion_steps=diffusion_steps,
diffusion_curriculum=diffusion_curriculum,
use_checkpoint=use_checkpoint,
reversible=reversible,
qat=qat,
)
        model = load_model(weights_path)
        print("Workflow results:", results)
        if diffusion:
            # Sample a bit sequence from the trained model via diffusion.
            sample = diffusion_inference(
                model, length=max_len, steps=diffusion_steps, schedule=noise_schedule
            )
            print("Diffusion inference output bits:", sample[0].tolist())
        if hf_repo:
            # Push the final weights back to the Hub.
            save_checkpoint(model, repo_id=hf_repo)
finally:
if launch_ui:
_terminate(procs)
return model, collapsed
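

# Example (illustrative sketch): driving the workflow from Python instead of
# the CLI. All keyword arguments shown are parameters of run_workflow() above;
# the small step counts are only for a quick smoke test.
#
#     model, collapsed = run_workflow(
#         steps=2,
#         max_len=32,
#         dataset_size=64,
#         collapse=False,     # skip collapsed-model generation
#         diffusion=True,     # also draw a diffusion sample at the end
#         diffusion_steps=4,
#     )
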
if __name__ == "__main__":
# Use standardized CLI parser
parser = create_workflow_parser()
# Add workflow-specific arguments
workflow_group = parser.add_argument_group('Workflow Configuration')
workflow_group.add_argument("--steps", type=int, default=10,
help="Number of progressive scale-up steps")
workflow_group.add_argument("--plateau-steps", type=int, default=0,
help="Extra training steps at final size")
workflow_group.add_argument("--epochs-per-step", type=int, default=2,
help="Epochs per training step")
workflow_group.add_argument("--extra-steps", type=int, default=3,
help="Optimizer updates after each epoch")
workflow_group.add_argument("--no-collapse", action="store_true",
help="Skip collapsed model generation")
workflow_group.add_argument("--dashboard", action="store_true",
help="Launch MCP server and dashboard UI")
# Add advanced optimization arguments
opt_group = parser.add_argument_group('Advanced Optimization')
opt_group.add_argument("--no-checkpoint", action="store_true",
help="Disable gradient checkpointing (faster but more memory)")
opt_group.add_argument("--no-reversible", action="store_true",
help="Use standard transformer blocks instead of reversible layers")
opt_group.add_argument("--qat", action="store_true",
help="Enable 4-bit quantization-aware training")
    # Override parser defaults for this workflow. The standardized parser
    # exposes --seq-length, which maps onto ``max_len`` in run_workflow().
    parser.set_defaults(
        seq_length=64,
        dataset_size=128,
        weights_path="weights/model.pt.gz",
    )
args = parser.parse_args()
run_workflow(
args.steps,
args.seq_length, # Standardized name
args.dataset_size,
launch_ui=args.dashboard,
weights_path=args.weights_path,
collapsed_path=getattr(args, 'collapsed_path', 'weights/collapsed.pt.gz'),
plateau_steps=args.plateau_steps,
epochs_per_step=args.epochs_per_step,
extra_steps=args.extra_steps,
collapse=not args.no_collapse,
hf_repo=args.hf_repo,
hf_token=args.hf_token,
diffusion=args.diffusion_mode, # Standardized name
noise_schedule=args.noise_schedule,
diffusion_steps=args.diffusion_steps,
diffusion_curriculum=args.diffusion_curriculum,
use_checkpoint=not args.no_checkpoint,
reversible=not args.no_reversible,
qat=args.qat,
)
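
# Example (illustrative) CLI invocations. --steps, --dashboard, --qat, and the
# other flags added above are defined in this file; --seq-length, --hf-repo,
# --diffusion-mode and friends are assumed to come from create_workflow_parser().
#
#     python unified_workflow.py --steps 4 --epochs-per-step 1
#     python unified_workflow.py --dashboard --qat --no-collapse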