import ctypes
import os
from collections import OrderedDict

import numpy as np
import torch
from cuda import cuda, cudart, nvrtc
try:
    import tensorrt as trt
except ImportError:
    # Fall back to the pip wheel layout: preload the TensorRT shared libraries
    # shipped in the tensorrt_libs package, then retry the import.
    import tensorrt_libs

    trt_libs_path = tensorrt_libs.__path__[0]
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvinfer.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvinfer_plugin.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvonnxparser.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvparsers.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvinfer_builder_resource.so.8.6.1"))
    import tensorrt as trt

logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, "")
def _cudaGetErrorEnum(error):
    if isinstance(error, cuda.CUresult):
        err, name = cuda.cuGetErrorName(error)
        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
    elif isinstance(error, cudart.cudaError_t):
        return cudart.cudaGetErrorName(error)[1]
    elif isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    else:
        raise RuntimeError("Unknown error type: {}".format(error))
def checkCudaErrors(result):
    # cuda-python calls return a tuple: (error_code, *values)
    if result[0].value:
        raise RuntimeError(
            "CUDA error code={}({})".format(
                result[0].value, _cudaGetErrorEnum(result[0])
            )
        )
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]
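
# Example: checkCudaErrors unwraps the (status, value) tuples that cuda-python
# returns, raising on any non-zero status code.
#     device_ptr = checkCudaErrors(cudart.cudaMalloc(1024))
#     checkCudaErrors(cudart.cudaFree(device_ptr))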
class MyOutputAllocator(trt.IOutputAllocator):

    def __init__(self) -> None:
        super().__init__()
        # Members for outside use, filled in by TensorRT via the callbacks below
        self.shape = None
        self.n_bytes = 0
        self.address = 0

    def reallocate_output(self, tensor_name, old_address, size, alignment) -> int:
        return self.reallocate_common(tensor_name, old_address, size, alignment)

    def reallocate_output_async(
        self, tensor_name, old_address, size, alignment, stream
    ) -> int:
        return self.reallocate_common(tensor_name, old_address, size, alignment, stream)

    def notify_shape(self, tensor_name, shape):
        # TensorRT reports the actual output shape here once it is known
        self.shape = shape
        return

    def reallocate_common(
        self, tensor_name, old_address, size, alignment, stream=-1
    ):  # helper method, not part of the IOutputAllocator API
        # Grow-only strategy: reuse the old buffer if it is already big enough
        if size <= self.n_bytes:
            return old_address
        if old_address != 0:
            checkCudaErrors(cudart.cudaFree(old_address))
        if stream == -1:
            address = checkCudaErrors(cudart.cudaMalloc(size))
        else:
            address = checkCudaErrors(cudart.cudaMallocAsync(size, stream))
        self.n_bytes = size
        self.address = address
        return address
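
# A minimal usage sketch (hypothetical tensor name): attach one allocator per
# data-dependent-shape output before execution, then read back the shape and
# device address it recorded afterwards.
#     allocator = MyOutputAllocator()
#     context.set_output_allocator("dds_output", allocator)
#     context.execute_async_v3(0)
#     print(allocator.shape, allocator.address)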
class TRTWrapper:

    def __init__(
        self,
        trt_file: str,
        plugin_file_list: list = [],
    ) -> None:
        # Load custom plugins
        for plugin_file in plugin_file_list:
            ctypes.cdll.LoadLibrary(plugin_file)
        # Load engine bytes from file and deserialize
        self.model = trt_file
        with open(trt_file, "rb") as f, trt.Runtime(logger) as runtime:
            assert runtime
            self.engine = runtime.deserialize_cuda_engine(f.read())
            assert self.engine
        self.buffer = OrderedDict()
        self.output_allocator_map = OrderedDict()
        self.context = self.engine.create_execution_context()
        return
    def setup(self, input_data: dict = {}) -> None:
        # Free device buffers from a previous call; DDS buffers are owned by
        # their output allocator and are skipped here
        for name, value in self.buffer.items():
            _, device_buffer, _ = value
            if (
                device_buffer is not None
                and device_buffer != 0
                and name not in self.output_allocator_map
            ):
                checkCudaErrors(cudart.cudaFree(device_buffer))
                self.buffer[name][1] = None
                self.buffer[name][2] = 0

        self.tensor_name_list = [
            self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)
        ]
        self.n_input = sum(
            self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
            for name in self.tensor_name_list
        )
        self.n_output = self.engine.num_io_tensors - self.n_input

        # Bind input shapes; host-located inputs are shape tensors, so TensorRT
        # reads their values directly from host memory
        for name, data in input_data.items():
            if self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE:
                self.context.set_input_shape(name, data.shape)
            else:
                self.context.set_tensor_address(name, data.ctypes.data)

        # Prepare buffers before inference
        for name in self.tensor_name_list:
            data_type = self.engine.get_tensor_dtype(name)
            runtime_shape = self.context.get_tensor_shape(name)
            if name not in self.output_allocator_map:
                if -1 in runtime_shape:
                    # Data-Dependent-Shape (DDS) output: defer allocation to an
                    # output allocator; the "else" branch handles normal tensors
                    n_byte = 0  # self.context.get_max_output_size(name)
                    self.output_allocator_map[name] = MyOutputAllocator()
                    self.context.set_output_allocator(
                        name, self.output_allocator_map[name]
                    )
                    host_buffer = np.empty(0, dtype=trt.nptype(data_type))
                    device_buffer = None
                else:
                    n_byte = trt.volume(runtime_shape) * data_type.itemsize
                    host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
                    if (
                        self.engine.get_tensor_location(name)
                        == trt.TensorLocation.DEVICE
                    ):
                        device_buffer = checkCudaErrors(cudart.cudaMalloc(n_byte))
                    else:
                        device_buffer = None
                self.buffer[name] = [host_buffer, device_buffer, n_byte]
            else:
                # DDS output: the allocator reuses or regrows its own buffer,
                # so there is nothing to reallocate here
                pass

        # Copy input values into the host staging buffers
        for name, data in input_data.items():
            self.buffer[name][0] = np.ascontiguousarray(data)

        # Tell the context where each tensor lives
        for name in self.tensor_name_list:
            if self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE:
                if self.buffer[name][1] is not None:
                    self.context.set_tensor_address(name, self.buffer[name][1])
            elif self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
                self.context.set_tensor_address(name, self.buffer[name][0].ctypes.data)
        return
    def infer(self, stream=0) -> None:
        # Synchronous inference: copy inputs H2D, execute, copy outputs D2H
        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpy(
                        self.buffer[name][1],
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                    )
                )

        self.context.execute_async_v3(stream)

        # Collect the shape and address each output allocator recorded for its
        # DDS output, and allocate a matching host buffer
        for name in self.output_allocator_map:
            myOutputAllocator = self.context.get_output_allocator(name)
            runtime_shape = myOutputAllocator.shape
            data_type = self.engine.get_tensor_dtype(name)
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = myOutputAllocator.address
            n_bytes = trt.volume(runtime_shape) * data_type.itemsize
            self.buffer[name] = [host_buffer, device_buffer, n_bytes]

        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpy(
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][1],
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                    )
                )
        return
    def infer_async(self, stream=0) -> None:
        # Asynchronous inference on the given CUDA stream
        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpyAsync(
                        self.buffer[name][1],
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                        stream,
                    )
                )

        self.context.execute_async_v3(stream)

        for name in self.output_allocator_map:
            myOutputAllocator = self.context.get_output_allocator(name)
            runtime_shape = myOutputAllocator.shape
            data_type = self.engine.get_tensor_dtype(name)
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = myOutputAllocator.address
            n_bytes = trt.volume(runtime_shape) * data_type.itemsize
            self.buffer[name] = [host_buffer, device_buffer, n_bytes]

        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpyAsync(
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][1],
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                        stream,
                    )
                )
        return
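
    # Note: after infer_async() the device-to-host copies are still in flight;
    # synchronize the stream before reading any host output buffer:
    #     checkCudaErrors(cudart.cudaStreamSynchronize(stream))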
    def __del__(self):
        # Best-effort cleanup: free any device buffers we still own
        if hasattr(self, "buffer") and self.buffer is not None:
            for _, device_buffer, _ in self.buffer.values():
                if (
                    device_buffer is not None
                    and device_buffer != 0
                    and cudart is not None
                ):
                    try:
                        checkCudaErrors(cudart.cudaFree(device_buffer))
                    except TypeError:
                        # cudart may already be torn down at interpreter shutdown
                        pass
        return
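
if __name__ == "__main__":
    # A minimal end-to-end sketch. The engine file "model.trt", the input tensor
    # name "x", and the input shape are assumptions for illustration only, not
    # something this module provides.
    wrapper = TRTWrapper("model.trt")
    x = np.random.rand(1, 3, 224, 224).astype(np.float32)
    wrapper.setup({"x": x})
    wrapper.infer()
    for name in wrapper.tensor_name_list:
        if wrapper.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
            print(name, wrapper.buffer[name][0].shape)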