'''Download the pretrained PaddlePaddle UIE weights and convert them to PyTorch format.
'''
import argparse
import collections
import json
import os
import pickle
import torch
import logging
import shutil
from tqdm import tqdm
import time
# logging.Logger('log') would create a logger with no handlers, so .info()
# calls would be silently dropped; configure the root logger instead
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_path_from_url(url, root_dir, check_exist=True, decompress=True):
""" Download from given url to root_dir.
if file or directory specified by url is exists under
root_dir, return the path directly, otherwise download
from url and decompress it, return the path.
Args:
url (str): download url
root_dir (str): root dir for downloading, it should be
WEIGHTS_HOME or DATASET_HOME
decompress (bool): decompress zip or tar file. Default is `True`
Returns:
str: a local path to save downloaded models & weights & datasets.
"""
    import tarfile
    import zipfile
def is_url(path):
"""
Whether path is URL.
Args:
path (string): URL string or not.
"""
return path.startswith('http://') or path.startswith('https://')
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = os.path.split(url)[-1]
fpath = fname
return os.path.join(root_dir, fpath)
def _get_download(url, fullname):
import requests
# using requests.get method
fname = os.path.basename(fullname)
try:
req = requests.get(url, stream=True)
except Exception as e: # requests.exceptions.ConnectionError
logger.info("Downloading {} from {} failed with exception {}".format(
fname, url, str(e)))
return False
if req.status_code != 200:
raise RuntimeError("Downloading from {} failed with code "
"{}!".format(url, req.status_code))
        # To guard against interrupted downloads, write to tmp_fullname
        # first and move it to fullname only after the download finishes
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _download(url, path):
"""
Download from url, save to path.
url (str): download url
path (str): download to given path
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
logger.info("Downloading {} from {}".format(fname, url))
DOWNLOAD_RETRY_LIMIT = 3
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RuntimeError("Download from {} failed. "
"Retry limit reached".format(url))
if not _get_download(url, fullname):
time.sleep(1)
continue
return fullname
def _uncompress_file_zip(filepath):
with zipfile.ZipFile(filepath, 'r') as files:
file_list = files.namelist()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
# `strip(os.sep)` to remove `os.sep` in the tail of path
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
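    # Illustrative sketch (assumed inputs, not executed): how archive name
    # lists are classified by the two helpers above, e.g. on POSIX:
    #   _is_a_single_file(['model.bin'])           -> True  (one entry, no separator)
    #   _is_a_single_dir(['m/a.bin', 'm/b.json'])  -> True  (shared top-level dir 'm')
    #   _is_a_single_dir(['a.bin', 'm/b.json'])    -> False (mixed top levels)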
def _uncompress_file_tar(filepath, mode="r:*"):
with tarfile.open(filepath, mode) as files:
file_list = files.getnames()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _decompress(fname):
"""
Decompress for zip and tar file
"""
logger.info("Decompressing {}...".format(fname))
        # dispatch on the archive type; extraction happens next to the
        # archive (see _uncompress_file_zip / _uncompress_file_tar for
        # how the output directory is chosen)
if tarfile.is_tarfile(fname):
uncompressed_path = _uncompress_file_tar(fname)
elif zipfile.is_zipfile(fname):
uncompressed_path = _uncompress_file_zip(fname)
else:
            raise TypeError("Unsupported compressed file type {}".format(fname))
return uncompressed_path
    assert is_url(url), "downloading from {}, which is not a URL".format(url)
fullpath = _map_path(url, root_dir)
if os.path.exists(fullpath) and check_exist:
logger.info("Found {}".format(fullpath))
else:
fullpath = _download(url, root_dir)
if decompress and (tarfile.is_tarfile(fullpath) or
zipfile.is_zipfile(fullpath)):
fullpath = _decompress(fullpath)
return fullpath
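# Hedged usage sketch for get_path_from_url: fetch one resource file into a
# local model directory (URL taken from MODEL_MAP below). vocab.txt is not an
# archive, so it is returned as-is under uie-base/.
# >>> vocab_path = get_path_from_url(
# ...     "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
# ...     root_dir="uie-base")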
MODEL_MAP = {
"uie-base": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json"
}
},
"uie-medium": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-mini": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-micro": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-nano": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-medical-base": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medical_base_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-tiny": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/tokenizer_config.json"
}
}
}
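# Note: MODEL_MAP keys double as local directory names; check_model() below
# downloads every resource into "<model_name>/", saved under its URL basename:
# >>> MODEL_MAP['uie-base']['resource_file_urls']['vocab_file'].rsplit('/', 1)[-1]
# 'vocab.txt'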
def build_params_map(attention_num=12):
"""
build params map from paddle-paddle's ERNIE to transformer's BERT
:return:
"""
weight_map = collections.OrderedDict({
'encoder.embeddings.word_embeddings.weight': "bert.embeddings.word_embeddings.weight",
'encoder.embeddings.position_embeddings.weight': "bert.embeddings.position_embeddings.weight",
'encoder.embeddings.token_type_embeddings.weight': "bert.embeddings.token_type_embeddings.weight",
        'encoder.embeddings.task_type_embeddings.weight': "embeddings.task_type_embeddings.weight",  # no 'bert.' prefix here; mapped directly onto the bert4torch structure
'encoder.embeddings.layer_norm.weight': 'bert.embeddings.LayerNorm.weight',
'encoder.embeddings.layer_norm.bias': 'bert.embeddings.LayerNorm.bias',
})
# add attention layers
for i in range(attention_num):
weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.query.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.query.bias'
weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.key.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.key.bias'
weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.value.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.value.bias'
weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.weight'] = f'bert.encoder.layer.{i}.attention.output.dense.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.bias'] = f'bert.encoder.layer.{i}.attention.output.dense.bias'
weight_map[f'encoder.encoder.layers.{i}.norm1.weight'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.weight'
weight_map[f'encoder.encoder.layers.{i}.norm1.bias'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.bias'
weight_map[f'encoder.encoder.layers.{i}.linear1.weight'] = f'bert.encoder.layer.{i}.intermediate.dense.weight'
weight_map[f'encoder.encoder.layers.{i}.linear1.bias'] = f'bert.encoder.layer.{i}.intermediate.dense.bias'
weight_map[f'encoder.encoder.layers.{i}.linear2.weight'] = f'bert.encoder.layer.{i}.output.dense.weight'
weight_map[f'encoder.encoder.layers.{i}.linear2.bias'] = f'bert.encoder.layer.{i}.output.dense.bias'
weight_map[f'encoder.encoder.layers.{i}.norm2.weight'] = f'bert.encoder.layer.{i}.output.LayerNorm.weight'
weight_map[f'encoder.encoder.layers.{i}.norm2.bias'] = f'bert.encoder.layer.{i}.output.LayerNorm.bias'
# add pooler
weight_map.update(
{
'encoder.pooler.dense.weight': 'bert.pooler.dense.weight',
'encoder.pooler.dense.bias': 'bert.pooler.dense.bias',
'linear_start.weight': 'linear_start.weight',
'linear_start.bias': 'linear_start.bias',
'linear_end.weight': 'linear_end.weight',
'linear_end.bias': 'linear_end.bias',
}
)
return weight_map
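# Quick sanity sketch of the mapping above (illustrative, e.g. for layer 0):
# >>> wm = build_params_map(attention_num=12)
# >>> wm['encoder.encoder.layers.0.self_attn.q_proj.weight']
# 'bert.encoder.layer.0.attention.self.query.weight'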
def extract_and_convert(input_dir, output_dir):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
logger.info('=' * 20 + 'save config file' + '=' * 20)
    with open(os.path.join(input_dir, 'model_config.json'), 'rt', encoding='utf-8') as f:
        config = json.load(f)
    config = config['init_args'][0]
config["architectures"] = ["UIE"]
config['layer_norm_eps'] = 1e-12
del config['init_class']
if 'sent_type_vocab_size' in config:
config['type_vocab_size'] = config['sent_type_vocab_size']
config['intermediate_size'] = 4 * config['hidden_size']
    with open(os.path.join(output_dir, 'config.json'), 'wt', encoding='utf-8') as f:
        json.dump(config, f, indent=4)
logger.info('=' * 20 + 'save vocab file' + '=' * 20)
with open(os.path.join(input_dir, 'vocab.txt'), 'rt', encoding='utf-8') as f:
words = f.read().splitlines()
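    # Paddle vocab files may contain duplicated tokens; loading such a vocab
    # into a dict-based tokenizer would collapse entries and shift token ids.
    # Keep the last occurrence of each token and replace earlier duplicates
    # with unique placeholder characters (🚩, 🚪, ...) so ids stay one-to-one.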
words_set = set()
words_duplicate_indices = []
for i in range(len(words)-1, -1, -1):
word = words[i]
if word in words_set:
words_duplicate_indices.append(i)
words_set.add(word)
for i, idx in enumerate(words_duplicate_indices):
words[idx] = chr(0x1F6A9+i) # Change duplicated word to 🚩 LOL
with open(os.path.join(output_dir, 'vocab.txt'), 'wt', encoding='utf-8') as f:
for word in words:
f.write(word+'\n')
special_tokens_map = {
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]"
}
    with open(os.path.join(output_dir, 'special_tokens_map.json'), 'wt', encoding='utf-8') as f:
        json.dump(special_tokens_map, f)
tokenizer_config = {
"do_lower_case": True,
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"tokenizer_class": "BertTokenizer"
}
    with open(os.path.join(output_dir, 'tokenizer_config.json'), 'wt', encoding='utf-8') as f:
        json.dump(tokenizer_config, f)
logger.info('=' * 20 + 'extract weights' + '=' * 20)
state_dict = collections.OrderedDict()
weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    with open(os.path.join(input_dir, 'model_state.pdparams'), 'rb') as f:
        paddle_paddle_params = pickle.load(f)
    del paddle_paddle_params['StructuredToParameterName@@']
for weight_name, weight_value in paddle_paddle_params.items():
        if 'weight' in weight_name:
            if 'encoder.encoder' in weight_name or 'pooler' in weight_name or 'linear' in weight_name:
                # paddle nn.Linear stores weights as (in_features, out_features);
                # torch expects (out_features, in_features), so transpose
                weight_value = weight_value.transpose()
        # Fix: zero out row 0 (the [PAD] token) of the word embedding matrix
        if 'word_embeddings.weight' in weight_name:
            weight_value[0, :] = 0
if weight_name not in weight_map:
logger.info(f"{'='*20} [SKIP] {weight_name} {'='*20}")
continue
state_dict[weight_map[weight_name]] = torch.FloatTensor(weight_value)
logger.info(f"{weight_name} -> {weight_map[weight_name]} {weight_value.shape}")
torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
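# Hedged sketch: load the converted checkpoint back with plain torch to check
# the remapped names (paths assume the default CLI arguments below).
# >>> sd = torch.load('uie_base_pytorch/pytorch_model.bin', map_location='cpu')
# >>> 'bert.embeddings.word_embeddings.weight' in sd
# True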
def check_model(input_model):
if not os.path.exists(input_model):
if input_model not in MODEL_MAP:
            raise ValueError(
                "input_model {} does not exist and is not a known model name in MODEL_MAP".format(input_model))
resource_file_urls = MODEL_MAP[input_model]['resource_file_urls']
logger.info("Downloading resource files...")
        for key, val in resource_file_urls.items():
            # downloads are saved under the URL basename, so check that name
            # (the map key is only a label and differs for the tokenizer files)
            file_path = os.path.join(input_model, os.path.split(val)[-1])
            if not os.path.exists(file_path):
                get_path_from_url(val, input_model)
def do_main():
check_model(args.input_model)
extract_and_convert(args.input_model, args.output_model)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_model", default="uie-base", type=str,
help="Directory of input paddle model.\n Will auto download model [uie-base/uie-tiny]")
parser.add_argument("-o", "--output_model", default="uie_base_pytorch", type=str,
help="Directory of output pytorch model")
args = parser.parse_args()
do_main()
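# Example invocation (assumes network access to the bcebos URLs above):
#   python convert.py -i uie-base -o uie_base_pytorch
#   python convert.py -i uie-tiny -o uie_tiny_pytorch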