File size: 1,994 Bytes
36c78b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
"""
Test script for BitTransformerLM dataset creation (small version)
"""

import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))

# Test the dataset builder with a small sample
from bit_transformer.dataset_builder import BitTransformerDatasetBuilder

def test_small_dataset():
    """Smoke-test ``BitTransformerDatasetBuilder`` on a handful of tiny inputs.

    Builds a builder for the ``"BitTransformerLM"`` repo id, then calls its
    three generators with small sizes (five short texts at ``max_len=128``,
    10 synthetic patterns at ``max_len=64``, 8 safety benchmarks) and prints
    how many samples each produced, followed by the fields of the first
    text-to-bits sample.

    The Hugging Face token is read from the ``HF_TOKEN`` environment
    variable; when it is unset the placeholder ``'your-token-here'`` is
    passed to the builder instead.

    No assertions are made on the generated content: a failure shows up as
    whatever exception the failing builder call (or the ``text_samples[0]``
    lookup on an empty result) raises.

    Returns:
        bool: ``True`` once every step has run without raising.
    """
    print("🧪 Testing BitTransformerLM Dataset Builder...")

    # Create builder with your token.
    # The lookup must actually execute: previously the whole expression was
    # wrapped in quotes, so the builder received the source text
    # "os.environ.get(...)" as its token instead of the environment value.
    hf_token = os.environ.get("HF_TOKEN", "your-token-here")
    repo_id = "BitTransformerLM"

    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    # Test text-to-bits generation
    print("📝 Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]

    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"✅ Generated {len(text_samples)} text-to-bits samples")

    # Test synthetic patterns
    print("🎨 Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"✅ Generated {len(synthetic_samples)} synthetic samples")

    # Test safety benchmarks
    print("🛡️ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"✅ Generated {len(safety_samples)} safety samples")

    # Show sample structure
    print("\n📊 Sample Structure:")
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            # The bit sequence can be long; show its length and a short
            # prefix rather than dumping the whole thing.
            print(f"  {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f"  {key}: {value}")

    print("\n🎉 All tests passed! Dataset builder is working correctly.")
    return True

# Run the smoke test only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    test_small_dataset()