TensorRT-Demo/trt_ssd_async.py

186 lines
6.1 KiB
Python

"""trt_ssd_async.py
This is the 'async' version of trt_ssd.py implementation. It creates
1 dedicated child thread for fetching camera input and do inferencing
with the TensorRT optimized SSD model/engine, while using the main
thread for drawing detection results and displaying video. Ideally,
the 2 threads work in a pipeline fashion so overall throughput (FPS)
would be improved comparing to the non-async version.
"""
import time
import argparse
import threading
import cv2
import pycuda.driver as cuda
from utils.ssd_classes import get_cls_dict
from utils.ssd import TrtSSD
from utils.camera import add_camera_args, Camera
from utils.display import open_window, set_display, show_fps
from utils.visualization import BBoxVisualization
WINDOW_NAME = 'TrtSsdDemoAsync'
MAIN_THREAD_TIMEOUT = 20.0 # 20 seconds
INPUT_HW = (300, 300)
SUPPORTED_MODELS = [
'ssd_mobilenet_v1_coco',
'ssd_mobilenet_v1_egohands',
'ssd_mobilenet_v2_coco',
'ssd_mobilenet_v2_egohands',
'ssd_inception_v2_coco',
'ssdlite_mobilenet_v2_coco',
]
# These global variables are 'shared' between the main and child
# threads. The child thread writes new frame and detection result
# into these variables, while the main thread reads from them.
s_img, s_boxes, s_confs, s_clss = None, None, None, None
def parse_args():
"""Parse input arguments."""
desc = ('Capture and display live camera video, while doing '
'real-time object detection with TensorRT optimized '
'SSD model on Jetson Nano')
parser = argparse.ArgumentParser(description=desc)
parser = add_camera_args(parser)
parser.add_argument('-m', '--model', type=str,
default='ssd_mobilenet_v1_coco',
choices=SUPPORTED_MODELS)
args = parser.parse_args()
return args
class TrtThread(threading.Thread):
"""TrtThread
This implements the child thread which continues to read images
from cam (input) and to do TRT engine inferencing. The child
thread stores the input image and detection results into global
variables and uses a condition varaiable to inform main thread.
In other words, the TrtThread acts as the producer while the
main thread is the consumer.
"""
def __init__(self, condition, cam, model, conf_th):
"""__init__
# Arguments
condition: the condition variable used to notify main
thread about new frame and detection result
cam: the camera object for reading input image frames
model: a string, specifying the TRT SSD model
conf_th: confidence threshold for detection
"""
threading.Thread.__init__(self)
self.condition = condition
self.cam = cam
self.model = model
self.conf_th = conf_th
self.cuda_ctx = None # to be created when run
self.trt_ssd = None # to be created when run
self.running = False
def run(self):
"""Run until 'running' flag is set to False by main thread.
NOTE: CUDA context is created here, i.e. inside the thread
which calls CUDA kernels. In other words, creating CUDA
context in __init__() doesn't work.
"""
global s_img, s_boxes, s_confs, s_clss
print('TrtThread: loading the TRT SSD engine...')
self.cuda_ctx = cuda.Device(0).make_context() # GPU 0
self.trt_ssd = TrtSSD(self.model, INPUT_HW)
print('TrtThread: start running...')
self.running = True
while self.running:
img = self.cam.read()
if img is None:
break
boxes, confs, clss = self.trt_ssd.detect(img, self.conf_th)
with self.condition:
s_img, s_boxes, s_confs, s_clss = img, boxes, confs, clss
self.condition.notify()
del self.trt_ssd
self.cuda_ctx.pop()
del self.cuda_ctx
print('TrtThread: stopped...')
def stop(self):
self.running = False
self.join()
def loop_and_display(condition, vis):
"""Take detection results from the child thread and display.
# Arguments
condition: the condition variable for synchronization with
the child thread.
vis: for visualization.
"""
global s_img, s_boxes, s_confs, s_clss
full_scrn = False
fps = 0.0
tic = time.time()
while True:
if cv2.getWindowProperty(WINDOW_NAME, 0) < 0:
break
with condition:
# Wait for the next frame and detection result. When
# getting the signal from the child thread, save the
# references to the frame and detection result for
# display.
if condition.wait(timeout=MAIN_THREAD_TIMEOUT):
img, boxes, confs, clss = s_img, s_boxes, s_confs, s_clss
else:
raise SystemExit('ERROR: timeout waiting for img from child')
img = vis.draw_bboxes(img, boxes, confs, clss)
img = show_fps(img, fps)
cv2.imshow(WINDOW_NAME, img)
toc = time.time()
curr_fps = 1.0 / (toc - tic)
# calculate an exponentially decaying average of fps number
fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05)
tic = toc
key = cv2.waitKey(1)
if key == 27: # ESC key: quit program
break
elif key == ord('F') or key == ord('f'): # Toggle fullscreen
full_scrn = not full_scrn
set_display(WINDOW_NAME, full_scrn)
def main():
args = parse_args()
cam = Camera(args)
if not cam.isOpened():
raise SystemExit('ERROR: failed to open camera!')
cuda.init() # init pycuda driver
cls_dict = get_cls_dict(args.model.split('_')[-1])
open_window(
WINDOW_NAME, 'Camera TensorRT SSD Demo',
cam.img_width, cam.img_height)
vis = BBoxVisualization(cls_dict)
condition = threading.Condition()
trt_thread = TrtThread(condition, cam, args.model, conf_th=0.3)
trt_thread.start() # start the child thread
loop_and_display(condition, vis)
trt_thread.stop() # stop the child thread
cam.release()
cv2.destroyAllWindows()
if __name__ == '__main__':
main()