|
|
|
""" |
|
Test script for BitTransformerLM dataset creation (small version) |
|
""" |
|
|
|
import os |
|
import sys |
|
from pathlib import Path |
|
# Put this script's own directory at the front of the import search path
# (presumably so the `bit_transformer` import that follows resolves when the
# script is run directly -- confirm against the repository layout).
_SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(_SCRIPT_DIR))
|
|
|
|
|
from bit_transformer.dataset_builder import BitTransformerDatasetBuilder |
|
|
|
def test_small_dataset():
    """Smoke-test the BitTransformerLM dataset builder on tiny inputs.

    Constructs a ``BitTransformerDatasetBuilder`` and calls its three
    generators (text-to-bits, synthetic patterns, safety benchmarks) with
    small sizes, printing how many samples each one produced. It then prints
    every field of the first text-to-bits sample, abbreviating the
    ``bit_sequence`` field to its length and first ten entries.

    The token handed to the builder is read from the ``HF_TOKEN`` environment
    variable, falling back to the placeholder ``'your-token-here'``.

    Returns:
        bool: Always ``True``. Nothing is caught here, so any failure in the
        builder calls (or an empty text-sample list) surfaces as an exception.
    """
    # NOTE(review): three status messages below still start with a bare "π".
    # That glyph is the remains of a mis-decoded emoji whose other bytes are
    # not recoverable from this file -- restore them from the upstream source.
    print("🧪 Testing BitTransformerLM Dataset Builder...")

    # Evaluate the environment lookup (rather than passing a quoted copy of
    # the expression) so that a real HF_TOKEN is actually honoured.
    hf_token = os.environ.get("HF_TOKEN", "your-token-here")
    repo_id = "BitTransformerLM"

    builder = BitTransformerDatasetBuilder(hf_token, repo_id)

    print("π Testing text-to-bits conversion...")
    test_texts = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
        "Binary data processing with transformers.",
        "Information theory meets deep learning.",
        "Parity-protected bit sequences for safety.",
    ]

    text_samples = builder.generate_text_to_bits_data(test_texts, max_len=128)
    print(f"✅ Generated {len(text_samples)} text-to-bits samples")

    print("🎨 Testing synthetic patterns...")
    synthetic_samples = builder.generate_synthetic_patterns(10, max_len=64)
    print(f"✅ Generated {len(synthetic_samples)} synthetic samples")

    print("🛡️ Testing safety benchmarks...")
    safety_samples = builder.generate_safety_benchmarks(8)
    print(f"✅ Generated {len(safety_samples)} safety samples")

    print("\nπ Sample Structure:")
    sample = text_samples[0]
    for key, value in sample.items():
        if key == "bit_sequence":
            # The full sequence is too long to print; show its size and a
            # short prefix instead.
            print(f" {key}: [{len(value)} bits] {value[:10]}...")
        else:
            print(f" {key}: {value}")

    print("\nπ All tests passed! Dataset builder is working correctly.")
    return True
|
|
|
# Script entry point: run the smoke test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    test_small_dataset()