import ctypes
import os
from collections import OrderedDict

import numpy as np
import torch
from cuda import cuda, cudart, nvrtc
try:
    import tensorrt as trt
except ImportError:
    # Fall back to the pip wheel layout: preload the TensorRT shared libraries
    # shipped in the tensorrt_libs package, then retry the import.
    import tensorrt_libs

    trt_libs_path = tensorrt_libs.__path__[0]
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvinfer.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvinfer_plugin.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvonnxparser.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvparsers.so.8"))
    ctypes.CDLL(os.path.join(trt_libs_path, "libnvinfer_builder_resource.so.8.6.1"))
    import tensorrt as trt

logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, "")
def _cudaGetErrorEnum(error):
    if isinstance(error, cuda.CUresult):
        err, name = cuda.cuGetErrorName(error)
        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
    elif isinstance(error, cudart.cudaError_t):
        return cudart.cudaGetErrorName(error)[1]
    elif isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    else:
        raise RuntimeError("Unknown error type: {}".format(error))
def checkCudaErrors(result):
    # cuda-python calls return a tuple: (error_code, *values)
    if result[0].value:
        raise RuntimeError(
            "CUDA error code={}({})".format(
                result[0].value, _cudaGetErrorEnum(result[0])
            )
        )
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]
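
# Example: checkCudaErrors unwraps the (status, value) tuples that cuda-python
# returns, raising on any non-zero status code.
#     device_ptr = checkCudaErrors(cudart.cudaMalloc(1024))
#     checkCudaErrors(cudart.cudaFree(device_ptr))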
class MyOutputAllocator(trt.IOutputAllocator):

    def __init__(self) -> None:
        super().__init__()
        # Members for outside use, filled in by TensorRT via the callbacks below
        self.shape = None
        self.n_bytes = 0
        self.address = 0

    def reallocate_output(self, tensor_name, old_address, size, alignment) -> int:
        return self.reallocate_common(tensor_name, old_address, size, alignment)

    def reallocate_output_async(
        self, tensor_name, old_address, size, alignment, stream
    ) -> int:
        return self.reallocate_common(tensor_name, old_address, size, alignment, stream)

    def notify_shape(self, tensor_name, shape):
        # TensorRT reports the actual output shape here once it is known
        self.shape = shape
        return

    def reallocate_common(
        self, tensor_name, old_address, size, alignment, stream=-1
    ):  # helper method, not part of the IOutputAllocator API
        # Grow-only strategy: reuse the old buffer if it is already big enough
        if size <= self.n_bytes:
            return old_address
        if old_address != 0:
            checkCudaErrors(cudart.cudaFree(old_address))
        if stream == -1:
            address = checkCudaErrors(cudart.cudaMalloc(size))
        else:
            address = checkCudaErrors(cudart.cudaMallocAsync(size, stream))
        self.n_bytes = size
        self.address = address
        return address
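
# A minimal usage sketch (hypothetical tensor name): attach one allocator per
# data-dependent-shape output before execution, then read back the shape and
# device address it recorded afterwards.
#     allocator = MyOutputAllocator()
#     context.set_output_allocator("dds_output", allocator)
#     context.execute_async_v3(0)
#     print(allocator.shape, allocator.address)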
class TRTWrapper:

    def __init__(
        self,
        trt_file: str,
        plugin_file_list: list = [],
    ) -> None:
        # Load custom plugins
        for plugin_file in plugin_file_list:
            ctypes.cdll.LoadLibrary(plugin_file)
        # Load engine bytes from file and deserialize
        self.model = trt_file
        with open(trt_file, "rb") as f, trt.Runtime(logger) as runtime:
            assert runtime
            self.engine = runtime.deserialize_cuda_engine(f.read())
            assert self.engine
        self.buffer = OrderedDict()
        self.output_allocator_map = OrderedDict()
        self.context = self.engine.create_execution_context()
        return
    def setup(self, input_data: dict = {}) -> None:
        # Free device buffers from a previous call; DDS buffers are owned by
        # their output allocator and are skipped here
        for name, value in self.buffer.items():
            _, device_buffer, _ = value
            if (
                device_buffer is not None
                and device_buffer != 0
                and name not in self.output_allocator_map
            ):
                checkCudaErrors(cudart.cudaFree(device_buffer))
                self.buffer[name][1] = None
                self.buffer[name][2] = 0

        self.tensor_name_list = [
            self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)
        ]
        self.n_input = sum(
            self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
            for name in self.tensor_name_list
        )
        self.n_output = self.engine.num_io_tensors - self.n_input

        # Bind input shapes; host-located inputs are shape tensors, so TensorRT
        # reads their values directly from host memory
        for name, data in input_data.items():
            if self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE:
                self.context.set_input_shape(name, data.shape)
            else:
                self.context.set_tensor_address(name, data.ctypes.data)

        # Prepare buffers before inference
        for name in self.tensor_name_list:
            data_type = self.engine.get_tensor_dtype(name)
            runtime_shape = self.context.get_tensor_shape(name)
            if name not in self.output_allocator_map:
                if -1 in runtime_shape:
                    # Data-Dependent-Shape (DDS) output: defer allocation to an
                    # output allocator; the "else" branch handles normal tensors
                    n_byte = 0  # self.context.get_max_output_size(name)
                    self.output_allocator_map[name] = MyOutputAllocator()
                    self.context.set_output_allocator(
                        name, self.output_allocator_map[name]
                    )
                    host_buffer = np.empty(0, dtype=trt.nptype(data_type))
                    device_buffer = None
                else:
                    n_byte = trt.volume(runtime_shape) * data_type.itemsize
                    host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
                    if (
                        self.engine.get_tensor_location(name)
                        == trt.TensorLocation.DEVICE
                    ):
                        device_buffer = checkCudaErrors(cudart.cudaMalloc(n_byte))
                    else:
                        device_buffer = None
                self.buffer[name] = [host_buffer, device_buffer, n_byte]
            else:
                # DDS output: the allocator reuses or regrows its own buffer,
                # so there is nothing to reallocate here
                pass

        # Copy input values into the host staging buffers
        for name, data in input_data.items():
            self.buffer[name][0] = np.ascontiguousarray(data)

        # Tell the context where each tensor lives
        for name in self.tensor_name_list:
            if self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE:
                if self.buffer[name][1] is not None:
                    self.context.set_tensor_address(name, self.buffer[name][1])
            elif self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
                self.context.set_tensor_address(name, self.buffer[name][0].ctypes.data)
        return
    def infer(self, stream=0) -> None:
        # Synchronous inference: copy inputs H2D, execute, copy outputs D2H
        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpy(
                        self.buffer[name][1],
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                    )
                )

        self.context.execute_async_v3(stream)

        # Collect the shape and address each output allocator recorded for its
        # DDS output, and allocate a matching host buffer
        for name in self.output_allocator_map:
            myOutputAllocator = self.context.get_output_allocator(name)
            runtime_shape = myOutputAllocator.shape
            data_type = self.engine.get_tensor_dtype(name)
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = myOutputAllocator.address
            n_bytes = trt.volume(runtime_shape) * data_type.itemsize
            self.buffer[name] = [host_buffer, device_buffer, n_bytes]

        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpy(
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][1],
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                    )
                )
        return
    def infer_async(self, stream=0) -> None:
        # Asynchronous inference on the given CUDA stream
        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpyAsync(
                        self.buffer[name][1],
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                        stream,
                    )
                )

        self.context.execute_async_v3(stream)

        for name in self.output_allocator_map:
            myOutputAllocator = self.context.get_output_allocator(name)
            runtime_shape = myOutputAllocator.shape
            data_type = self.engine.get_tensor_dtype(name)
            host_buffer = np.empty(runtime_shape, dtype=trt.nptype(data_type))
            device_buffer = myOutputAllocator.address
            n_bytes = trt.volume(runtime_shape) * data_type.itemsize
            self.buffer[name] = [host_buffer, device_buffer, n_bytes]

        for name in self.tensor_name_list:
            if (
                self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
                and self.engine.get_tensor_location(name) == trt.TensorLocation.DEVICE
            ):
                checkCudaErrors(
                    cudart.cudaMemcpyAsync(
                        self.buffer[name][0].ctypes.data,
                        self.buffer[name][1],
                        self.buffer[name][2],
                        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                        stream,
                    )
                )
        return
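
    # Note: after infer_async() the device-to-host copies are still in flight;
    # synchronize the stream before reading any host output buffer:
    #     checkCudaErrors(cudart.cudaStreamSynchronize(stream))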
    def __del__(self):
        # Best-effort cleanup: free any device buffers we still own
        if hasattr(self, "buffer") and self.buffer is not None:
            for _, device_buffer, _ in self.buffer.values():
                if (
                    device_buffer is not None
                    and device_buffer != 0
                    and cudart is not None
                ):
                    try:
                        checkCudaErrors(cudart.cudaFree(device_buffer))
                    except TypeError:
                        # cudart may already be torn down at interpreter shutdown
                        pass
        return
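
if __name__ == "__main__":
    # A minimal end-to-end sketch. The engine file "model.trt", the input tensor
    # name "x", and the input shape are assumptions for illustration only, not
    # something this module provides.
    wrapper = TRTWrapper("model.trt")
    x = np.random.rand(1, 3, 224, 224).astype(np.float32)
    wrapper.setup({"x": x})
    wrapper.infer()
    for name in wrapper.tensor_name_list:
        if wrapper.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
            print(name, wrapper.buffer[name][0].shape)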