#!/usr/bin/env python3
"""
Test script for BitTransformerLM dataset creation (small version)
"""

import os
import sys
from pathlib import Path

# Make the directory containing this script importable so the local
# `bit_transformer` package resolves when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent))

# Test the dataset builder with a small sample
from bit_transformer.dataset_builder import BitTransformerDatasetBuilder  # noqa: E402


def test_small_dataset() -> bool:
    """Smoke-test the dataset builder's three generators on tiny inputs.

    Builds a ``BitTransformerDatasetBuilder`` from the ``HF_TOKEN``
    environment variable (falling back to a placeholder), then calls
    ``generate_text_to_bits_data``, ``generate_synthetic_patterns`` and
    ``generate_safety_benchmarks`` with small sizes, printing how many
    samples each produced and the fields of the first text sample.

    Returns:
        ``True`` once every generator call has completed.

    Raises:
        IndexError: if the text-to-bits generator returns no samples.
    """
    print("🧪 Testing BitTransformerLM Dataset Builder...")

    # Create builder with your token. Read it from the environment at run
    # time; the placeholder keeps the script runnable without credentials.
    hf_token = os.environ.get("HF_TOKEN", "your-token-here")
    repo_id = "BitTransformerLM"

    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    # Test text-to-bits generation
    print("📝 Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]

    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"✅ Generated {len(text_samples)} text-to-bits samples")

    # Test synthetic patterns
    print("🎨 Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"✅ Generated {len(synthetic_samples)} synthetic samples")

    # Test safety benchmarks
    print("🛡️ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"✅ Generated {len(safety_samples)} safety samples")

    # Show sample structure
    print("\n📊 Sample Structure:")
    # NOTE(review): samples are treated as mappings here (``.items()``);
    # confirm against the builder's return type.
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            # Bit sequences can be long; show the length and a short prefix.
            print(f" {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f" {key}: {value}")

    print("\n🎉 All tests passed! Dataset builder is working correctly.")
    return True


if __name__ == "__main__":
    test_small_dataset()