"""yolo_with_plugins.py Implementation of TrtYOLO class with the yolo_layer plugins. """ from __future__ import print_function import ctypes import numpy as np import cv2 import tensorrt as trt import pycuda.driver as cuda try: ctypes.cdll.LoadLibrary('./plugins/libyolo_layer.so') except OSError as e: raise SystemExit('ERROR: failed to load ./plugins/libyolo_layer.so. ' 'Did you forget to do a "make" in the "./plugins/" ' 'subdirectory?') from e def _preprocess_yolo(img, input_shape, letter_box=False): """Preprocess an image before TRT YOLO inferencing. # Args img: int8 numpy array of shape (img_h, img_w, 3) input_shape: a tuple of (H, W) letter_box: boolean, specifies whether to keep aspect ratio and create a "letterboxed" image for inference # Returns preprocessed img: float32 numpy array of shape (3, H, W) """ if letter_box: img_h, img_w, _ = img.shape new_h, new_w = input_shape[0], input_shape[1] offset_h, offset_w = 0, 0 if (new_w / img_w) <= (new_h / img_h): new_h = int(img_h * new_w / img_w) offset_h = (input_shape[0] - new_h) // 2 else: new_w = int(img_w * new_h / img_h) offset_w = (input_shape[1] - new_w) // 2 resized = cv2.resize(img, (new_w, new_h)) img = np.full((input_shape[0], input_shape[1], 3), 127, dtype=np.uint8) img[offset_h:(offset_h + new_h), offset_w:(offset_w + new_w), :] = resized else: img = cv2.resize(img, (input_shape[1], input_shape[0])) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = img.transpose((2, 0, 1)).astype(np.float32) img /= 255.0 return img def _nms_boxes(detections, nms_threshold): """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their confidence scores and return an array with the indexes of the bounding boxes we want to keep. # Args detections: Nx7 numpy arrays of [[x, y, w, h, box_confidence, class_id, class_prob], ......] """ x_coord = detections[:, 0] y_coord = detections[:, 1] width = detections[:, 2] height = detections[:, 3] box_confidences = detections[:, 4] * detections[:, 6] areas = width * height ordered = box_confidences.argsort()[::-1] keep = list() while ordered.size > 0: # Index of the current element: i = ordered[0] keep.append(i) xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]]) yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]]) xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]]) yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]]) width1 = np.maximum(0.0, xx2 - xx1 + 1) height1 = np.maximum(0.0, yy2 - yy1 + 1) intersection = width1 * height1 union = (areas[i] + areas[ordered[1:]] - intersection) iou = intersection / union indexes = np.where(iou <= nms_threshold)[0] ordered = ordered[indexes + 1] keep = np.array(keep) return keep def _postprocess_yolo(trt_outputs, img_w, img_h, conf_th, nms_threshold, input_shape, letter_box=False): """Postprocess TensorRT outputs. # Args trt_outputs: a list of 2 or 3 tensors, where each tensor contains a multiple of 7 float32 numbers in the order of [x, y, w, h, box_confidence, class_id, class_prob] conf_th: confidence threshold letter_box: boolean, referring to _preprocess_yolo() # Returns boxes, scores, classes (after NMS) """ # filter low-conf detections and concatenate results of all yolo layers # 输出应该是[3*(80 + c + w + h + x + y)),(13+26+52),(13+26+52)] # 猜测是应该将80分类换算成了1个id号和1个置信度,这可能就是yolo更换输出的原因,确实是在插件中更换的输出类型 detections = [] for o in trt_outputs: # x, y, w, h , c , id , score dets = o.reshape((-1, 7)) dets = dets[dets[:, 4] * dets[:, 6] >= conf_th] detections.append(dets) detections = np.concatenate(detections, axis=0) if len(detections) == 0: boxes = np.zeros((0, 4), dtype=np.int) scores = np.zeros((0,), dtype=np.float32) classes = np.zeros((0,), dtype=np.float32) else: box_scores = detections[:, 4] * detections[:, 6] # scale x, y, w, h from [0, 1] to pixel values old_h, old_w = img_h, img_w offset_h, offset_w = 0, 0 if letter_box: if (img_w / input_shape[1]) >= (img_h / input_shape[0]): old_h = int(input_shape[0] * img_w / input_shape[1]) offset_h = (old_h - img_h) // 2 else: old_w = int(input_shape[1] * img_h / input_shape[0]) offset_w = (old_w - img_w) // 2 detections[:, 0:4] *= np.array( [old_w, old_h, old_w, old_h], dtype=np.float32) # NMS nms_detections = np.zeros((0, 7), dtype=detections.dtype) for class_id in set(detections[:, 5]): idxs = np.where(detections[:, 5] == class_id) cls_detections = detections[idxs] keep = _nms_boxes(cls_detections, nms_threshold) nms_detections = np.concatenate( [nms_detections, cls_detections[keep]], axis=0) xx = nms_detections[:, 0].reshape(-1, 1) yy = nms_detections[:, 1].reshape(-1, 1) if letter_box: xx = xx - offset_w yy = yy - offset_h ww = nms_detections[:, 2].reshape(-1, 1) hh = nms_detections[:, 3].reshape(-1, 1) boxes = np.concatenate([xx, yy, xx+ww, yy+hh], axis=1) + 0.5 boxes = boxes.astype(np.int) scores = nms_detections[:, 4] * nms_detections[:, 6] classes = nms_detections[:, 5] return boxes, scores, classes class HostDeviceMem(object): """Simple helper data class that's a little nicer to use than a 2-tuple.""" def __init__(self, host_mem, device_mem): self.host = host_mem self.device = device_mem def __str__(self): return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) def __repr__(self): return self.__str__() def __del__(self): del self.device del self.host def get_input_shape(engine): """Get input shape of the TensorRT YOLO engine.""" binding = engine[0] assert engine.binding_is_input(binding) binding_dims = engine.get_binding_shape(binding) if len(binding_dims) == 4: return tuple(binding_dims[2:]) elif len(binding_dims) == 3: return tuple(binding_dims[1:]) else: raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims))) def allocate_buffers(engine): """Allocates all host/device in/out buffers required for an engine.""" inputs = [] outputs = [] bindings = [] output_idx = 0 stream = cuda.Stream() for binding in engine: binding_dims = engine.get_binding_shape(binding) if len(binding_dims) == 4: # explicit batch case (TensorRT 7+) size = trt.volume(binding_dims) elif len(binding_dims) == 3: # implicit batch case (TensorRT 6 or older) size = trt.volume(binding_dims) * engine.max_batch_size else: raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims))) dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: # each grid has 3 anchors, each anchor generates a detection # output of 7 float32 values assert size % 7 == 0 outputs.append(HostDeviceMem(host_mem, device_mem)) output_idx += 1 assert len(inputs) == 1 assert len(outputs) == 1 return inputs, outputs, bindings, stream def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): """do_inference (for TensorRT 6.x or lower) This function is generalized for multiple inputs/outputs. Inputs and outputs are expected to be lists of HostDeviceMem objects. """ # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. return [out.host for out in outputs] def do_inference_v2(context, bindings, inputs, outputs, stream): """do_inference_v2 (for TensorRT 7.0+) This function is generalized for multiple inputs/outputs for full dimension networks. Inputs and outputs are expected to be lists of HostDeviceMem objects. """ # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. return [out.host for out in outputs] class TrtYOLO(object): """TrtYOLO class encapsulates things needed to run TRT YOLO.""" def _load_engine(self): TRTbin = 'yolo/%s.trt' % self.model with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime: return runtime.deserialize_cuda_engine(f.read()) def __init__(self, model, category_num=80, letter_box=False, cuda_ctx=None): """Initialize TensorRT plugins, engine and conetxt.""" # 保存engine模型 self.model = model # 保存分类数 self.category_num = category_num # 统一输入大小到letterbox self.letter_box = letter_box # 默认CUDA上下文只能从创建它的CPU线程访问,其他线程访问需push/pop从创建它的线程中弹出它,这样context可以被推送到任何其他CPU线程的当前上下文栈,并且随后的CUDA调用将引用该上下文。 self.cuda_ctx = cuda_ctx if self.cuda_ctx: self.cuda_ctx.push() # 设置推理函数 self.inference_fn = do_inference if trt.__version__[0] < '7' \ else do_inference_v2 # 打印日志,启动一个logging界面,抑制warning和errors,仅报告informational messages。 self.trt_logger = trt.Logger(trt.Logger.INFO) # 加载模型deserialize self.engine = self._load_engine() # 从模型中获取输入大小 self.input_shape = get_input_shape(self.engine) try: # 创建一个上下文,储存中间值,因为engine包含network定义和训练参数,因此需要额外的空间。 self.context = self.engine.create_execution_context() # create_execution_context是写在ICudaEngine.py的一个闭源方法,这个方法是创建立一个IExecutionContext类型的对象。 self.inputs, self.outputs, self.bindings, self.stream = \ allocate_buffers(self.engine) # 为输入输出分配host和device的buffers。host指的是CPU内存,device指的是GPU显存 except Exception as e: raise RuntimeError('fail to allocate CUDA resources') from e finally: if self.cuda_ctx: self.cuda_ctx.pop() def __del__(self): """Free CUDA memories.""" del self.outputs del self.inputs del self.stream def detect(self, img, conf_th=0.3, letter_box=None): """Detect objects in the input image.""" letter_box = self.letter_box if letter_box is None else letter_box # 保证输入源统一大小,符合推理模型使用 img_resized = _preprocess_yolo(img, self.input_shape, letter_box) # Set host input to the image. The do_inference() function # will copy the input to the GPU before executing. # 开辟一块内存空间,用于放入输入图像 self.inputs[0].host = np.ascontiguousarray(img_resized) if self.cuda_ctx: self.cuda_ctx.push() # 开始推理 trt_outputs = self.inference_fn( context=self.context, # 制定GPU的Context,可以理解为上下文,{} bindings=self.bindings, # 大概指的是内存到显存之间的绑定关系 inputs=self.inputs, # 输入数据 outputs=self.outputs, # 输出数据 stream=self.stream) # cuda的操作顺序流 if self.cuda_ctx: self.cuda_ctx.pop() # 后处理GPU返回的输出结果 boxes, scores, classes = _postprocess_yolo( trt_outputs, img.shape[1], img.shape[0], conf_th, nms_threshold=0.5, input_shape=self.input_shape, letter_box=letter_box) # clip x1, y1, x2, y2 within original image boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, img.shape[1]-1) boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, img.shape[0]-1) return boxes, scores, classes