Common TensorRT Python Operations

Date: 2019/10/12 Categories: Work Tags: TensorRT DeepLearningInference



Minimal Builder

Here we build a minimal network that contains only a single identity layer.

import tensorrt as trt

## Setup Builder
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_batch_flag)
builder_config = builder.create_builder_config()
builder_config.max_workspace_size = 5000 * (1024 * 1024) # 5000 MiB

## Build Network
input_shape = (1,4)
x = network.add_input(name="X", dtype=trt.int32, shape=input_shape)
id_layer = network.add_identity(x)
x = id_layer.get_output(0)
network.mark_output(x)

## Get Engine
engine = builder.build_engine(network, builder_config)

## Write engine
with open('engine', 'wb') as f:
    f.write(engine.serialize())

Minimal Inference

import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt

input_shape = (1,4)

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)

# restore engine
with open('engine', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# make context
context = engine.create_execution_context()

# allocate input and output memory
input_nbytes = trt.volume(input_shape)* trt.int32.itemsize
d_input = cuda.mem_alloc(input_nbytes)
context.set_binding_shape(0, input_shape)
h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(1)), dtype=np.int32)  # binding 1 is the output
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()  # Create a stream in which to copy inputs/outputs and run inference.

# setup input
data = np.array([[1,2,3,4]], dtype='int32')
cuda.memcpy_htod_async(d_input, data, stream)

# do inference
context.execute_async_v2(bindings=[int(d_input)] + [int(d_output)], stream_handle=stream.handle)

# copy results to host
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()

# show result
print(h_output)

Dynamic Shape

Dynamic batch is a special case of dynamic shape. The network below extracts the first element of each batch, using TensorRT's gather layer.

import tensorrt as trt
import numpy as np

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(explicit_batch_flag)
builder_config = builder.create_builder_config()
builder_config.max_workspace_size = 5000 * (1024 * 1024) # 5000 MiB

profile = builder.create_optimization_profile()
profile.set_shape("X", min=[1,4], opt=[4,4], max=[32,4])  # allow the batch dimension to vary from 1 to 32
builder_config.add_optimization_profile(profile)

# Input Layer
input_shape = (-1,4)
x = network.add_input(name="X", dtype=trt.int32, shape=input_shape)

# Identity Layer
id_layer = network.add_identity(x)
x = id_layer.get_output(0)

indices = network.add_constant([1], np.array([0], dtype='int32')).get_output(0)
axis = 1
gather_layer = network.add_gather(x, indices, axis)
x = gather_layer.get_output(0)
print(f"shape: {x.shape}")

network.mark_output(x)
engine = builder.build_engine(network, builder_config)
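
To actually run this engine with a varying batch size, the input binding's shape must be set on the execution context before inference. A minimal sketch, assuming the engine built above and the same pycuda setup as in Minimal Inference; batch_size here is an illustrative value:

import pycuda.driver as cuda
import pycuda.autoinit

context = engine.create_execution_context()
batch_size = 2  # any value inside the profile's [min, max] range
context.set_binding_shape(0, (batch_size, 4))  # resolve the dynamic batch dimension

data = np.arange(batch_size * 4, dtype='int32').reshape(batch_size, 4)
d_input = cuda.mem_alloc(data.nbytes)
h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(1)), dtype=np.int32)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()

cuda.memcpy_htod_async(d_input, data, stream)
context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()
print(h_output)  # the first element of each row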

See the Working With Dynamic Shapes and IGatherLayer sections of the TensorRT documentation.

Fully-connected layer

The fully-connected layer in TRT takes a 4-dimensional input [B, C, 1, 1] and also produces a 4-dimensional output [B, O, 1, 1], so the input must first be reshaped to a suitable shape (a shuffle-based reshape sketch follows this example). Below is an example.

# build the engine
import tensorrt as trt
import numpy as np

## Setup Builder
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)

## Build Network
batch_size = 2
input_dim = 4
output_dim = 2
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
input_shape = (batch_size, input_dim, 1, 1)
x = network.add_input(name="X", dtype=trt.float32, shape=input_shape)

# TRT interprets the fully-connected kernel as (output_dim, input_dim), row-major
W = np.ones((output_dim, input_dim), dtype='float32')
B = np.zeros(output_dim, dtype='float32')
fc_layer = network.add_fully_connected(x, output_dim, W, B)
y = fc_layer.get_output(0)

network.mark_output(y)
engine = builder.build_engine(network, builder.create_builder_config())

# do inference

def nvinfer(engine, X_shape, Y_shape, X_dtype='float32', output_dtype='float32'):
    import pycuda.driver as cuda
    import pycuda.autoinit
    import numpy as np
    import tensorrt as trt

    # make context
    context = engine.create_execution_context()

    # allocate input and output memory
    input_nbytes = trt.volume(X_shape) * getattr(trt, X_dtype).itemsize
    d_input = cuda.mem_alloc(input_nbytes)
    context.set_binding_shape(0, X_shape)
    h_output = cuda.pagelocked_empty(Y_shape, dtype=output_dtype)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()  # Create a stream in which to copy inputs/outputs and run inference.

    def run(X):
        cuda.memcpy_htod_async(d_input, X, stream)
        # do inference
        context.execute_async_v2(bindings=[int(d_input)] + [int(d_output)], stream_handle=stream.handle)
        # copy results to host
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()
        return h_output
    return run

run = nvinfer(engine, (2, 4), [2, 2])
X = np.ones([2, 4], dtype='float32')
print(run(X) - np.matmul(X, W.T))  # expect all zeros
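
If the input tensor starts out 2-D, e.g. [B, C], a shuffle layer can produce the [B, C, 1, 1] layout the fully-connected layer expects. A minimal sketch, assuming the builder objects from the example above; "X2D" is a hypothetical input name:

x2d = network.add_input(name="X2D", dtype=trt.float32, shape=(batch_size, input_dim))  # hypothetical 2-D input
shuffle_layer = network.add_shuffle(x2d)
shuffle_layer.reshape_dims = (batch_size, input_dim, 1, 1)  # [B, C] -> [B, C, 1, 1]
x4d = shuffle_layer.get_output(0)
fc_layer2 = network.add_fully_connected(x4d, output_dim, W, B)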

Shape operations

  • shuffle: in TensorRT a shuffle consists of three steps: first_transpose, reshape, and second_transpose
  • gather: extracts the given indices along one dimension
  • slice: extracts a sub-region spanning multiple dimensions (start, shape, stride); a combined sketch of all three follows
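
A minimal sketch of all three layers, assuming a network built as in the earlier examples and an input tensor x of shape (2, 3, 4); the shapes in the comments follow from the layer definitions:

# shuffle: transpose, then reshape, then transpose again
shuffle_layer = network.add_shuffle(x)            # x: (2, 3, 4)
shuffle_layer.first_transpose = (1, 0, 2)         # -> (3, 2, 4)
shuffle_layer.reshape_dims = (3, 8)               # -> (3, 8)
shuffle_layer.second_transpose = (1, 0)           # -> (8, 3)

# gather: pick the given indices along one axis
indices = network.add_constant([2], np.array([0, 2], dtype='int32')).get_output(0)
gather_layer = network.add_gather(x, indices, 1)  # axis=1 -> (2, 2, 4)

# slice: a strided sub-region over all dimensions
slice_layer = network.add_slice(x, start=(0, 0, 0), shape=(2, 2, 2), stride=(1, 1, 2))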