In [1]:
#!/usr/bin/env python3
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import ctypes
import argparse
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
In [2]:
SIZE = 32
In [3]:
max_query_length = SIZE
# When splitting up a long document into chunks, how much stride to take between chunks.
doc_stride = SIZE
# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
max_seq_length = SIZE

# Import necessary plugins for BERT TensorRT
ctypes.CDLL("libnvinfer_plugin.so.6", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("libcommon.so", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("libbert_plugins.so", mode=ctypes.RTLD_GLOBAL)
Out[3]:
<CDLL 'libbert_plugins.so', handle 37fc880 at 0x7f392d247d10>
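As a quick sanity check that the BERT plugins actually registered, the plugin registry can be listed; a minimal sketch, assuming only the standard TensorRT plugin-registry Python API:

registry = trt.get_plugin_registry()
print([creator.name for creator in registry.plugin_creator_list])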
In [4]:
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
In [5]:
with open('./trt_bert/p40/{}.trt'.format(SIZE), 'rb') as f:
    serialized_bert = f.read()
In [6]:
runtime = trt.Runtime(TRT_LOGGER)
In [7]:
engine = runtime.deserialize_cuda_engine(serialized_bert)
In [8]:
context = engine.create_execution_context()
In [9]:
BATCH_SIZE = 8
In [10]:
input_shape=(BATCH_SIZE, max_seq_length)
input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
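For reference, with BATCH_SIZE = 8, max_seq_length = 32, and 4-byte int32 token ids, each input buffer is 8 × 32 × 4 = 1024 bytes.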
In [11]:
# One device buffer per input binding: input_ids, segment_ids, input_mask
d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]
In [12]:
# Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)
# Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
for binding in range(3):
    context.set_binding_shape(binding, input_shape)
assert context.all_binding_shapes_specified
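To see how the three inputs and the output binding line up, the engine can be inspected directly; a small sketch using the same TensorRT binding API the notebook already relies on:

for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i), engine.binding_is_input(i), context.get_binding_shape(i))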
In [13]:
# Binding 3 is the output: per-token start/end logits of shape (BATCH_SIZE, max_seq_length, 2, 1, 1)
h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)
In [14]:
import data_processing as dp
import tokenization
In [15]:
tokenizer = tokenization.FullTokenizer(vocab_file='./trt_bert/model/vocab.txt',
                                       do_lower_case=True)
WARNING:tensorflow:From /apdcephfs/private_andyfei/trt/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

In [16]:
doc_tokens = dp.convert_doc_tokens('hello world, nice to meet you ' * 100)
In [17]:
def question_features(question):
    return dp.convert_examples_to_features(doc_tokens, question, tokenizer,
                                           max_seq_length, doc_stride, max_query_length)
In [18]:
features = question_features('hello world')

# Replicate the single example BATCH_SIZE times and flatten each array to match
# the (BATCH_SIZE, max_seq_length) input bindings set above.
features['input_ids'] = np.reshape(np.stack([features['input_ids']] * BATCH_SIZE), [-1])
features['input_mask'] = np.reshape(np.stack([features['input_mask']] * BATCH_SIZE), [-1])
features['segment_ids'] = np.reshape(np.stack([features['segment_ids']] * BATCH_SIZE), [-1])
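A quick sanity check on the batched buffers; the int32 dtype is an assumption here, chosen to match the int32-sized device buffers allocated earlier:

assert features['input_ids'].size == BATCH_SIZE * max_seq_length
assert features['input_ids'].dtype == np.int32  # assumed dtype; must match trt.int32 used for input_nbytes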
In [19]:
stream = cuda.Stream()  # Create a stream in which to copy inputs/outputs and run inference.
In [20]:
def inference(features):
    eval_start_time = time.time()

    # Copy inputs to the device (binding order: input_ids, segment_ids, input_mask)
    cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream)
    cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream)
    cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream)
    # Run inference
    context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from GPU
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

    eval_time_elapsed = time.time() - eval_start_time
    #print("Running inference in {:.3f} s".format(eval_time_elapsed))

    return h_output


def print_predictions(h_output, features):
    # Post-process the raw logits into answers (kept out of inference() so the
    # timing cells below measure only the GPU round trip).
    for index, batch in enumerate(h_output):
        start_logits = batch[:, 0]
        end_logits = batch[:, 1]

        # Total number of n-best predictions to generate in the nbest_predictions.json output file
        n_best_size = 20

        # The maximum length of an answer that can be generated. This is needed
        # because the start and end predictions are not conditioned on one another
        max_answer_length = 30

        prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,
                start_logits, end_logits, n_best_size, max_answer_length)

        print("Processing output {:} in batch".format(index))
        print("Answer: '{}'".format(prediction))
        print("With probability: {:.3f}".format(nbest_json[0]['probability'] * 100.0))
In [21]:
print(BATCH_SIZE)
print(SIZE)
8
32
In [22]:
%%time
out = inference(features)
out.shape
CPU times: user 9.07 ms, sys: 252 µs, total: 9.32 ms
Wall time: 8.73 ms
Out[22]:
(8, 32, 2, 1, 1)
In [23]:
%timeit inference(features)
8.34 ms ± 3.83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
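Since each call runs a full batch, the %timeit mean translates directly into throughput; a back-of-the-envelope sketch using the numbers above:

latency_s = 8.34e-3  # mean latency per inference() call from %timeit
print("{:.0f} sequences/second".format(BATCH_SIZE / latency_s))  # roughly 960 seq/s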