"""Split a text file into numbered shard files at blank-line boundaries.

Usage: python <script> FILENAME NUM_LINE OUTPUT_DIR

The input is read whole, stripped of leading/trailing whitespace and split
into lines.  A shard is closed at the first blank line that occurs once at
least NUM_LINE lines have accumulated, so blank-line-delimited groups are
never cut in the middle.  Shards are written to OUTPUT_DIR/000000.txt,
OUTPUT_DIR/000001.txt, ...
"""
from sys import argv


def split_into_shards(lines, num_line):
    """Group *lines* into shards that end on a blank line.

    Args:
        lines: list of lines without trailing newlines.
        num_line: minimum number of lines that must precede a blank line
            (within the current shard) before that blank line closes it.

    Returns:
        A list of shards, each a list of lines.  Every shard except possibly
        the last ends with the blank line that closed it; the last shard
        holds whatever remains after the final split.
    """
    shards = []
    start = 0
    for i, line in enumerate(lines):
        # Only cut on a blank line, and only once the shard is long enough.
        if line == '' and (i - start) >= num_line:
            shards.append(lines[start:i + 1])
            start = i + 1
    # Keep the remainder even when it is a single line; comparing against
    # len(lines) - 1 here would silently drop a one-line tail.
    if start < len(lines):
        shards.append(lines[start:])
    return shards


def main(args=None):
    """Run the sharder using *args*, or the command-line arguments if None.

    Args:
        args: optional sequence ``[filename, num_line, output_dir]``.

    Raises:
        SystemExit: if fewer than three arguments are supplied.
        ValueError: if ``num_line`` is not an integer literal.
    """
    if args is None:
        args = argv[1:]
    if len(args) < 3:
        raise SystemExit('usage: FILENAME NUM_LINE OUTPUT_DIR')
    filename, num_line, output_dir = args[:3]
    num_line = int(num_line)  # convert once instead of on every line

    with open(filename) as f:
        text = f.read().strip()
    # ''.split('\n') yields [''], which would become a bogus blank shard.
    lines = text.split('\n') if text else []

    for i, doc in enumerate(split_into_shards(lines, num_line)):
        with open(f'{output_dir}/{i:06}.txt', 'w') as f:
            f.write('\n'.join(doc) + '\n')


if __name__ == '__main__':
    main()