
π Final optimization: Update cli_standards.py with production-ready enhancements
e2ef423
verified
""" | |
BitTransformerLM CLI Argument Standards

Unified command-line interface standards for all BitTransformerLM scripts.
This module provides standardized argument parsers and naming conventions.
""" | |
import argparse | |
from typing import Optional, Callable | |
class BitTransformerCLI:
    """Standardized CLI argument parser for BitTransformerLM.

    The class is a namespace of helpers: each ``add_*_args`` static method
    attaches one titled argument group to an existing parser, and
    :meth:`create_standard_parser` assembles a new parser from a selection
    of those groups plus the common ``--verbose``/``--debug``/``--seed``
    flags. No instance state is used, so the class never needs to be
    instantiated.
    """

    @staticmethod
    def add_model_args(parser: argparse.ArgumentParser) -> None:
        """Add standard model configuration arguments.

        Args:
            parser: Parser that receives the 'Model Configuration' group.
        """
        model_group = parser.add_argument_group('Model Configuration')
        model_group.add_argument('--model-size', choices=['tiny', 'small', 'medium', 'large'],
                                 default='small', help='Model size preset')
        model_group.add_argument('--d-model', type=int, default=128,
                                 help='Model dimension')
        model_group.add_argument('--num-heads', type=int, default=8,
                                 help='Number of attention heads')
        model_group.add_argument('--num-layers', type=int, default=6,
                                 help='Number of transformer layers')
        model_group.add_argument('--dropout', type=float, default=0.1,
                                 help='Dropout rate')
        model_group.add_argument('--max-seq-len', type=int, default=512,
                                 help='Maximum sequence length')

    @staticmethod
    def add_training_args(parser: argparse.ArgumentParser) -> None:
        """Add standard training arguments.

        Args:
            parser: Parser that receives the 'Training Configuration' group.
        """
        train_group = parser.add_argument_group('Training Configuration')
        train_group.add_argument('--epochs', type=int, default=10,
                                 help='Number of training epochs')
        train_group.add_argument('--batch-size', type=int, default=16,
                                 help='Training batch size')
        train_group.add_argument('--learning-rate', type=float, default=1e-3,
                                 help='Learning rate')
        train_group.add_argument('--weight-decay', type=float, default=0.01,
                                 help='Weight decay')
        train_group.add_argument('--grad-clip', type=float, default=1.0,
                                 help='Gradient clipping threshold')
        train_group.add_argument('--warmup-steps', type=int, default=100,
                                 help='Number of warmup steps')

    @staticmethod
    def add_dataset_args(parser: argparse.ArgumentParser) -> None:
        """Add standard dataset arguments.

        Args:
            parser: Parser that receives the 'Dataset Configuration' group.
        """
        data_group = parser.add_argument_group('Dataset Configuration')
        data_group.add_argument('--dataset-name', type=str, default='synthetic',
                                help='Dataset name or path')
        data_group.add_argument('--dataset-size', type=int, default=10000,
                                help='Dataset size (number of samples)')
        data_group.add_argument('--seq-length', type=int, default=64,
                                help='Sequence length for training')
        data_group.add_argument('--validation-split', type=float, default=0.1,
                                help='Validation split ratio')

    @staticmethod
    def add_safety_args(parser: argparse.ArgumentParser) -> None:
        """Add safety and telemetry arguments.

        Args:
            parser: Parser that receives the 'Safety & Telemetry' group.
        """
        safety_group = parser.add_argument_group('Safety & Telemetry')
        safety_group.add_argument('--enable-safety-gates', action='store_true',
                                  help='Enable safety gates during inference')
        safety_group.add_argument('--min-negentropy', type=float, default=0.1,
                                  help='Minimum negentropy threshold')
        safety_group.add_argument('--max-complexity', type=float, default=0.9,
                                  help='Maximum LZ complexity threshold')
        safety_group.add_argument('--min-symbiosis', type=float, default=0.3,
                                  help='Minimum symbiosis score threshold')
        safety_group.add_argument('--telemetry-logging', action='store_true',
                                  help='Enable detailed telemetry logging')

    @staticmethod
    def add_optimization_args(parser: argparse.ArgumentParser) -> None:
        """Add optimization and performance arguments.

        Args:
            parser: Parser that receives the 'Optimization & Performance' group.
        """
        opt_group = parser.add_argument_group('Optimization & Performance')
        opt_group.add_argument('--use-amp', action='store_true',
                               help='Use automatic mixed precision')
        opt_group.add_argument('--gradient-checkpointing', action='store_true',
                               help='Use gradient checkpointing')
        opt_group.add_argument('--compile-model', action='store_true',
                               help='Use torch.compile for optimization')
        opt_group.add_argument('--chunk-size', type=int, default=None,
                               help='Chunk size for chunked attention')
        opt_group.add_argument('--num-workers', type=int, default=4,
                               help='Number of data loader workers')

    @staticmethod
    def add_distributed_args(parser: argparse.ArgumentParser) -> None:
        """Add distributed training arguments.

        Args:
            parser: Parser that receives the 'Distributed Training' group.
        """
        dist_group = parser.add_argument_group('Distributed Training')
        dist_group.add_argument('--distributed', action='store_true',
                                help='Enable distributed training')
        dist_group.add_argument('--world-size', type=int, default=1,
                                help='Number of distributed processes')
        dist_group.add_argument('--rank', type=int, default=0,
                                help='Process rank for distributed training')
        dist_group.add_argument('--backend', choices=['nccl', 'gloo'], default='nccl',
                                help='Distributed backend')

    @staticmethod
    def add_io_args(parser: argparse.ArgumentParser) -> None:
        """Add input/output arguments.

        Args:
            parser: Parser that receives the 'Input/Output' group.
        """
        io_group = parser.add_argument_group('Input/Output')
        # --input-path has no explicit default, so it parses to None when omitted.
        io_group.add_argument('--input-path', type=str,
                              help='Input file or directory path')
        io_group.add_argument('--output-path', type=str, default='./output',
                              help='Output directory path')
        io_group.add_argument('--weights-path', type=str, default='./weights/model.pt',
                              help='Model weights file path')
        io_group.add_argument('--checkpoint-dir', type=str, default='./checkpoints',
                              help='Checkpoint directory path')
        io_group.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                              default='INFO', help='Logging level')

    @staticmethod
    def add_huggingface_args(parser: argparse.ArgumentParser) -> None:
        """Add HuggingFace integration arguments.

        Args:
            parser: Parser that receives the 'HuggingFace Integration' group.
        """
        hf_group = parser.add_argument_group('HuggingFace Integration')
        hf_group.add_argument('--hf-repo', type=str,
                              help='HuggingFace repository ID')
        # NOTE(review): a token passed on the command line is visible in
        # process listings and shell history; consider also accepting it
        # from an environment variable in the calling script.
        hf_group.add_argument('--hf-token', type=str,
                              help='HuggingFace access token')
        hf_group.add_argument('--private-repo', action='store_true',
                              help='Create private HuggingFace repository')
        hf_group.add_argument('--auto-upload', action='store_true',
                              help='Automatically upload to HuggingFace after training')

    @staticmethod
    def add_diffusion_args(parser: argparse.ArgumentParser) -> None:
        """Add diffusion mode arguments.

        Args:
            parser: Parser that receives the 'Diffusion Mode' group.
        """
        diff_group = parser.add_argument_group('Diffusion Mode')
        diff_group.add_argument('--diffusion-mode', action='store_true',
                                help='Enable diffusion training mode')
        diff_group.add_argument('--diffusion-steps', type=int, default=8,
                                help='Number of diffusion steps')
        diff_group.add_argument('--noise-schedule', choices=['linear', 'cosine', 'exponential'],
                                default='linear', help='Noise schedule type')
        diff_group.add_argument('--diffusion-curriculum', action='store_true',
                                help='Use curriculum learning for diffusion')

    @classmethod
    def create_standard_parser(cls,
                               description: str,
                               include_groups: Optional[list] = None) -> argparse.ArgumentParser:
        """Create a standardized argument parser with specified groups.

        Args:
            description: Parser description.
            include_groups: List of group names to include. If None, the
                default subset ['model', 'training', 'dataset', 'safety',
                'io'] is used (not every available group). An empty list
                yields a parser with only the common flags.
                Options: ['model', 'training', 'dataset', 'safety', 'optimization',
                'distributed', 'io', 'huggingface', 'diffusion'].
                Names not in this list are skipped; repeated names are
                applied once.

        Returns:
            An ``argparse.ArgumentParser`` using
            ``ArgumentDefaultsHelpFormatter``, populated with the requested
            groups followed by ``--verbose``/``-v``, ``--debug`` and ``--seed``.
        """
        parser = argparse.ArgumentParser(
            description=description,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter
        )

        # Default groups to include if none specified
        if include_groups is None:
            include_groups = ['model', 'training', 'dataset', 'safety', 'io']

        # Add requested argument groups
        group_methods = {
            'model': cls.add_model_args,
            'training': cls.add_training_args,
            'dataset': cls.add_dataset_args,
            'safety': cls.add_safety_args,
            'optimization': cls.add_optimization_args,
            'distributed': cls.add_distributed_args,
            'io': cls.add_io_args,
            'huggingface': cls.add_huggingface_args,
            'diffusion': cls.add_diffusion_args,
        }

        # dict.fromkeys de-duplicates while preserving order: registering the
        # same group twice would make argparse raise a conflicting-option error.
        for group_name in dict.fromkeys(include_groups):
            if group_name in group_methods:
                group_methods[group_name](parser)

        # Add common flags
        parser.add_argument('--verbose', '-v', action='store_true',
                            help='Enable verbose output')
        parser.add_argument('--debug', action='store_true',
                            help='Enable debug mode')
        parser.add_argument('--seed', type=int, default=42,
                            help='Random seed for reproducibility')

        return parser
# Pre-configured parsers for common use cases
def create_training_parser() -> argparse.ArgumentParser:
    """Build the standard parser used by training scripts.

    Returns:
        The parser produced by ``BitTransformerCLI.create_standard_parser``
        for the model, training, dataset, safety, optimization, distributed,
        io and huggingface groups.
    """
    training_groups = [
        'model', 'training', 'dataset', 'safety',
        'optimization', 'distributed', 'io', 'huggingface',
    ]
    title = "BitTransformerLM Training Script"
    return BitTransformerCLI.create_standard_parser(title, training_groups)
def create_inference_parser() -> argparse.ArgumentParser:
    """Build the standard parser used by inference scripts.

    Returns:
        The parser produced by ``BitTransformerCLI.create_standard_parser``
        for the model, safety, io and diffusion groups.
    """
    inference_groups = ['model', 'safety', 'io', 'diffusion']
    title = "BitTransformerLM Inference Script"
    return BitTransformerCLI.create_standard_parser(title, inference_groups)
def create_evaluation_parser() -> argparse.ArgumentParser:
    """Build the standard parser used by evaluation scripts.

    Returns:
        The parser produced by ``BitTransformerCLI.create_standard_parser``
        for the model, dataset, safety and io groups.
    """
    evaluation_groups = ['model', 'dataset', 'safety', 'io']
    title = "BitTransformerLM Evaluation Script"
    return BitTransformerCLI.create_standard_parser(title, evaluation_groups)
def create_workflow_parser() -> argparse.ArgumentParser:
    """Build the standard parser used by workflow/pipeline scripts.

    Returns:
        The parser produced by ``BitTransformerCLI.create_standard_parser``
        for the model, training, dataset, safety, optimization, io,
        huggingface and diffusion groups.
    """
    workflow_groups = [
        'model', 'training', 'dataset', 'safety',
        'optimization', 'io', 'huggingface', 'diffusion',
    ]
    title = "BitTransformerLM Workflow Script"
    return BitTransformerCLI.create_standard_parser(title, workflow_groups)