#!/usr/bin/env python3
"""
Test script for BitTransformerLM dataset creation (small version)
"""
import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
# Test the dataset builder with a small sample
from bit_transformer.dataset_builder import BitTransformerDatasetBuilder
def test_small_dataset():
    """Smoke-test the BitTransformerLM dataset builder on tiny inputs.

    Exercises text-to-bits conversion, synthetic pattern generation, and
    safety-benchmark generation, then prints the structure of one sample.

    Returns:
        True if all generation calls completed without raising.
    """
    print("🧪 Testing BitTransformerLM Dataset Builder...")

    # BUG FIX: the original quoted the lookup, passing the literal string
    # "os.environ.get('HF_TOKEN', 'your-token-here')" as the token.
    # Read the token from the environment, with a placeholder fallback.
    hf_token = os.environ.get("HF_TOKEN", "your-token-here")
    repo_id = "BitTransformerLM"
    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    # Test text-to-bits generation.
    print("📝 Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]
    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"✅ Generated {len(text_samples)} text-to-bits samples")

    # Test synthetic patterns.
    print("🎨 Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"✅ Generated {len(synthetic_samples)} synthetic samples")

    # Test safety benchmarks.
    print("🛡️ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"✅ Generated {len(safety_samples)} safety samples")

    # Show the structure of one sample; truncate the (long) bit sequence.
    # NOTE(review): assumes samples are dicts with a "bit_sequence" key —
    # consistent with the iteration below, but confirm against the builder.
    print("\n📋 Sample Structure:")
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            print(f"  {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f"  {key}: {value}")

    print("\n🎉 All tests passed! Dataset builder is working correctly.")
    return True
if __name__ == "__main__":
test_small_dataset() |