#!/usr/bin/env python3
"""
Test script for BitTransformerLM dataset creation (small version)
"""
import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))

# Test the dataset builder with a small sample
from bit_transformer.dataset_builder import BitTransformerDatasetBuilder

def test_small_dataset():
    print("🧪 Testing BitTransformerLM Dataset Builder...")

    # Create builder with your token (read from the environment)
    hf_token = os.environ.get("HF_TOKEN", "your-token-here")
    repo_id = "BitTransformerLM"
    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    # Test text-to-bits generation
    print("📝 Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]
    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"✅ Generated {len(text_samples)} text-to-bits samples")

    # Test synthetic patterns
    print("🎨 Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"✅ Generated {len(synthetic_samples)} synthetic samples")

    # Test safety benchmarks
    print("🛡️ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"✅ Generated {len(safety_samples)} safety samples")

    # Show sample structure
    print("\n📊 Sample Structure:")
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            print(f" {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f" {key}: {value}")

    print("\n🎉 All tests passed! Dataset builder is working correctly.")
    return True

if __name__ == "__main__":
    test_small_dataset()
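
# Usage note (a sketch, not part of the original script): the test reads the
# Hugging Face token from the HF_TOKEN environment variable, falling back to a
# placeholder if it is unset, so export the token before running. The token
# value below is illustrative only.
#
#   export HF_TOKEN=hf_xxxxxxxx
#   python test_dataset_small.py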