| import pathlib | |
| import torch | |
| from datasets import load_dataset | |
# Default size cap, in MiB, on the raw text collected before bit expansion.
TXT_MB: int = 100

# Default destination of the serialized bit tensor (relative to the CWD).
OUT: pathlib.Path = pathlib.Path('full_bits.pt')
def _bytes_to_bits(data: bytearray) -> torch.Tensor:
    """Expand every byte of *data* into eight 0/1 values, MSB first.

    Returns a 1-D ``torch.uint8`` tensor of length ``8 * len(data)``.
    """
    if not data:
        # torch.frombuffer cannot wrap a zero-length buffer.
        return torch.empty(0, dtype=torch.uint8)
    # Zero-copy view of the bytes; the shift below allocates the result,
    # so the returned tensor does not alias ``data``.
    octets = torch.frombuffer(data, dtype=torch.uint8)
    # Shift amounts 7..0 put the most significant bit first, matching the
    # digit order of format(byte, '08b').
    shifts = torch.arange(7, -1, -1, dtype=torch.uint8)
    return ((octets.unsqueeze(1) >> shifts) & 1).reshape(-1)


def build_bits(out: pathlib.Path = OUT, txt_mb: int = TXT_MB) -> None:
    """Serialize the WikiText-2 (raw) training text as a flat tensor of bits.

    Lines of the ``train`` split are UTF-8 encoded, each followed by a
    newline byte, and concatenated. Collection stops after the first line
    that brings the buffer to at least ``txt_mb`` MiB, so the buffer may
    exceed the cap by up to one line; it is not truncated. Every byte is
    then expanded to eight 0/1 values (most significant bit first) and the
    resulting 1-D ``torch.uint8`` tensor is written with ``torch.save``.

    Args:
        out: Path of the file the tensor is saved to.
        txt_mb: Size threshold for the collected text, in MiB (2**20 bytes).
    """
    ds = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    limit = txt_mb * 2 ** 20

    buf = bytearray()
    for line in ds['text']:
        buf.extend(line.encode("utf-8"))
        buf.extend(b"\n")
        if len(buf) >= limit:
            break

    # Vectorized expansion: a Python list holding one int per bit would need
    # many gigabytes of memory for a buffer of the default size.
    torch.save(_bytes_to_bits(buf), out)
# Script entry point: build the bit tensor using the module-level defaults.
if __name__ == '__main__':
    build_bits()