Common TensorRT Python Operations
Date: 2019/10/12 Categories: Work Tags: TensorRT DeepLearningInference
Minimal Builder
This builds a network that contains only a single identity function.
import tensorrt as trt
## Setup Builder
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_batch_flag)
builder_config = builder.create_builder_config()
builder_config.max_workspace_size = 5000 * (1024 * 1024) # 5000 MiB
## Build Network
input_shape = (1,4)
x = network.add_input(name="X", dtype=trt.int32, shape=input_shape)
id_layer = network.add_identity(x)
x = id_layer.get_output(0)
network.mark_output(x)
## Get Engine
engine = builder.build_engine(network, builder_config)
## Write engine
with open('engine', 'wb') as f:
    f.write(engine.serialize())
Minimal Inference
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
input_shape = (1,4)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
# restore engine
with open('engine', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())
# make context
context = engine.create_execution_context()
# allocate input and output memory
input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
d_input = cuda.mem_alloc(input_nbytes)
context.set_binding_shape(0, input_shape)
h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(1)), dtype=np.int32)  # binding 1 is the output
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream() # Create a stream in which to copy inputs/outputs and run inference.
# setup input
data = np.array([[1,2,3,4]], dtype='int32')
cuda.memcpy_htod_async(d_input, data, stream)
# do inference
context.execute_async_v2(bindings=[int(d_input)] + [int(d_output)], stream_handle=stream.handle)
# copy results to host
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()
# show result
print(h_output)
Dynamic Shape
Dynamic batch is a special case of dynamic shape. The network below takes the first element of each batch, using TensorRT's gather layer.
import tensorrt as trt
import numpy as np
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_batch_flag)
builder_config = builder.create_builder_config()
builder_config.max_workspace_size = 5000 * (1024 * 1024) # 5000 MiB
profile = builder.create_optimization_profile()
profile.set_shape("X", min=[1,4], opt=[1,4], max=[1,4])
builder_config.add_optimization_profile(profile)
# Input Layer
input_shape = (-1,4)
x = network.add_input(name="X", dtype=trt.int32, shape=input_shape)
# Identity Layer
id_layer = network.add_identity(x)
x = id_layer.get_output(0)
indices = network.add_constant([1], np.array([0], dtype='int32')).get_output(0)
axis = 1
gather_layer = network.add_gather(x, indices, axis)
x = gather_layer.get_output(0)
print(f"shape: {x.shape}")
network.mark_output(x)
engine = builder.build_engine(network, builder_config)
This follows the documentation on Working With Dynamic Shapes and for IGatherLayer.
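A minimal inference sketch for this engine (continuing from the engine built above; the batch size below is illustrative and must lie within the optimization profile's range). The -1 batch dimension is resolved at runtime with set_binding_shape before the buffers are allocated.
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
context = engine.create_execution_context()
batch = 2  # illustrative; any batch size within the profile's [min, max]
context.set_binding_shape(0, (batch, 4))  # resolve the -1 dimension at runtime
data = np.arange(batch * 4, dtype='int32').reshape(batch, 4)
d_input = cuda.mem_alloc(data.nbytes)
h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(1)), dtype=np.int32)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, data, stream)
context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()
print(h_output)  # the first element of each row, shape (batch, 1)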
Fully-connected layer
The fully-connected layer in TRT takes a 4-dimensional input [B, C, 1, 1] and produces a 4-dimensional output [B, O, 1, 1], so the input must first be reshaped to a suitable shape (a sketch of doing the reshape with a shuffle layer follows the example). Here is an example:
# build the engine
import tensorrt as trt
import numpy as np
## Setup Builder
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
## Build Network
batch_size = 2
input_dim = 4
output_dim = 2
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
input_shape = (batch_size, input_dim, 1, 1)
x = network.add_input(name="X", dtype=trt.float32, shape=input_shape)
W = np.ones((output_dim, input_dim), dtype='float32')  # TRT FC kernel weights are laid out as (K, C)
B = np.zeros(output_dim, dtype='float32')
fc_layer = network.add_fully_connected(x, output_dim, W, B)
y = fc_layer.get_output(0)
network.mark_output(y)
engine = builder.build_engine(network, builder.create_builder_config())
# do inference
def nvinfer(engine, X_shape, Y_shape, X_dtype='float32', output_dtype='float32'):
    import pycuda.driver as cuda
    import pycuda.autoinit
    import numpy as np
    import tensorrt as trt
    # make context
    context = engine.create_execution_context()
    # allocate input and output memory
    input_nbytes = trt.volume(X_shape) * getattr(trt, X_dtype).itemsize
    d_input = cuda.mem_alloc(input_nbytes)
    context.set_binding_shape(0, X_shape)
    h_output = cuda.pagelocked_empty(Y_shape, dtype=output_dtype)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()  # Create a stream in which to copy inputs/outputs and run inference.
    def run(X):
        cuda.memcpy_htod_async(d_input, X, stream)
        # do inference
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # copy results to host
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()
        return h_output
    return run
run = nvinfer(engine, (batch_size, input_dim, 1, 1), (batch_size, output_dim))  # shapes match the engine bindings
X = np.ones([2,4], dtype='float32')
print(run(X) - np.matmul(X, W.T))  # expect all zeros
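The example above sidesteps the reshape by declaring the input as [B, C, 1, 1] directly. Here is a minimal sketch of doing the reshape inside the network instead, with a shuffle layer in front of the fully-connected layer (a fresh network; the 2-D input shape is an assumption for illustration):
import tensorrt as trt
import numpy as np
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
batch_size, input_dim, output_dim = 2, 4, 2
x = network.add_input(name="X", dtype=trt.float32, shape=(batch_size, input_dim))
reshape_layer = network.add_shuffle(x)
reshape_layer.reshape_dims = (batch_size, input_dim, 1, 1)  # to [B, C, 1, 1] for the FC layer
W = np.ones((output_dim, input_dim), dtype='float32')  # kernel laid out as (K, C)
B = np.zeros(output_dim, dtype='float32')
fc_layer = network.add_fully_connected(reshape_layer.get_output(0), output_dim, W, B)
network.mark_output(fc_layer.get_output(0))
engine = builder.build_engine(network, builder.create_builder_config())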
Shape Operations
shuffle: in TensorRT, a shuffle consists of three steps: first_transpose, reshape, and second_transpose
gather: extracts the given indices from a single dimension
slice: extracts indices from multiple dimensions
A short sketch of the shuffle and slice layers follows.
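A minimal sketch of the shuffle and slice layers (shapes are illustrative; gather was already shown in the dynamic-shape example above):
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
x = network.add_input(name="X", dtype=trt.float32, shape=(2, 3, 4))
# shuffle: first_transpose -> reshape -> second_transpose
shuffle_layer = network.add_shuffle(x)
shuffle_layer.first_transpose = (0, 2, 1)  # (2,3,4) -> (2,4,3)
shuffle_layer.reshape_dims = (2, 12)       # (2,4,3) -> (2,12)
print(shuffle_layer.get_output(0).shape)   # (2, 12)
# slice: take a region given a start, shape and stride for every dimension
slice_layer = network.add_slice(x, (0, 0, 0), (2, 3, 2), (1, 1, 2))
print(slice_layer.get_output(0).shape)     # (2, 3, 2)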