|
|
|
import shutil |
|
import sys |
|
import time |
|
from pathlib import Path |
|
|
|
import lmdb |
|
|
|
from mmocr.utils import list_from_file |
|
|
|
|
|
def lmdb_converter(img_list_file,
                   output,
                   batch_size=1000,
                   coding='utf-8',
                   lmdb_map_size=109951162776):
    """Convert a line-based list file into an LMDB database.

    Each line returned by ``list_from_file(img_list_file)`` is stored under
    its zero-based index (as a decimal string) and the number of lines is
    stored under the key ``total_number``. Keys and values are encoded with
    ``coding``.

    If ``output`` already exists as a directory, the user is asked on stdin
    whether to delete it; answering ``N``/``n`` aborts without writing
    anything, answering ``Y``/``y`` removes the directory and continues.

    Args:
        img_list_file: Path of the list file, passed to ``list_from_file``.
        output: Directory in which the LMDB database is created.
        batch_size: Number of records written per write transaction.
            Must be positive.
        coding: Text encoding used for both keys and values.
        lmdb_map_size: ``map_size`` handed to ``lmdb.open``.
            NOTE(review): the default is roughly 102 GiB; it looks like it
            may have been intended as 1 TiB (1099511627776) -- confirm
            before changing, callers may rely on the current value.

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Validate before any side effect: a zero step would make ``range`` fail
    # only after an existing database might already have been deleted, and a
    # negative step would silently write ``total_number`` with no entries.
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer, got %r' %
                         (batch_size, ))

    # Read the annotation lines.
    lines = list_from_file(img_list_file)
    total = len(lines)

    # Create the LMDB directory, asking before clobbering an existing one.
    if Path(output).is_dir():
        while True:
            print('%s already exist, delete or not? [Y/n]' % output)
            answer = input().strip()
            if answer in ('Y', 'y'):
                shutil.rmtree(output)
                break
            if answer in ('N', 'n'):
                return
    print('create database %s' % output)
    Path(output).mkdir(parents=True, exist_ok=False)
    env = lmdb.open(output, map_size=lmdb_map_size)

    # Always release the environment, even if a write fails part-way.
    try:
        beg_time = time.strftime('%H:%M:%S')
        for beg_index in range(0, total, batch_size):
            end_index = min(beg_index + batch_size, total)
            # '\r' rewrites the same console line to act as a progress bar.
            sys.stdout.write('\r[%s-%s], processing [%d-%d] / %d' %
                             (beg_time, time.strftime('%H:%M:%S'), beg_index,
                              end_index, total))
            sys.stdout.flush()
            batch = [(str(index).encode(coding), lines[index].encode(coding))
                     for index in range(beg_index, end_index)]
            # One write transaction per batch.
            with env.begin(write=True) as txn:
                cursor = txn.cursor()
                cursor.putmulti(batch, dupdata=False, overwrite=True)
        sys.stdout.write('\n')

        # Store the record count in the same database as the records.
        with env.begin(write=True) as txn:
            key = 'total_number'.encode(coding)
            value = str(total).encode(coding)
            txn.put(key, value)
    finally:
        env.close()
    print('done', flush=True)
|
|