"""mtcnn_trt.py """ import numpy as np import cv2 import pytrt PIXEL_MEAN = 127.5 PIXEL_SCALE = 0.0078125 def convert_to_1x1(boxes): """Convert detection boxes to 1:1 sizes # Arguments boxes: numpy array, shape (n,5), dtype=float32 # Returns boxes_1x1 """ boxes_1x1 = boxes.copy() hh = boxes[:, 3] - boxes[:, 1] + 1. ww = boxes[:, 2] - boxes[:, 0] + 1. mm = np.maximum(hh, ww) boxes_1x1[:, 0] = boxes[:, 0] + ww * 0.5 - mm * 0.5 boxes_1x1[:, 1] = boxes[:, 1] + hh * 0.5 - mm * 0.5 boxes_1x1[:, 2] = boxes_1x1[:, 0] + mm - 1. boxes_1x1[:, 3] = boxes_1x1[:, 1] + mm - 1. boxes_1x1[:, 0:4] = np.fix(boxes_1x1[:, 0:4]) return boxes_1x1 def crop_img_with_padding(img, box, padding=0): """Crop a box from image, with out-of-boundary pixels padded # Arguments img: img as a numpy array, shape (H, W, 3) box: numpy array, shape (5,) or (4,) padding: integer value for padded pixels # Returns cropped_im: cropped image as a numpy array, shape (H, W, 3) """ img_h, img_w, _ = img.shape if box.shape[0] == 5: cx1, cy1, cx2, cy2, _ = box.astype(int) elif box.shape[0] == 4: cx1, cy1, cx2, cy2 = box.astype(int) else: raise ValueError cw = cx2 - cx1 + 1 ch = cy2 - cy1 + 1 cropped_im = np.zeros((ch, cw, 3), dtype=np.uint8) + padding ex1 = max(0, -cx1) # ex/ey's are the destination coordinates ey1 = max(0, -cy1) ex2 = min(cw, img_w - cx1) ey2 = min(ch, img_h - cy1) fx1 = max(cx1, 0) # fx/fy's are the source coordinates fy1 = max(cy1, 0) fx2 = min(cx2+1, img_w) fy2 = min(cy2+1, img_h) cropped_im[ey1:ey2, ex1:ex2, :] = img[fy1:fy2, fx1:fx2, :] return cropped_im def nms(boxes, threshold, type='Union'): """Non-Maximum Supression # Arguments boxes: numpy array [:, 0:5] of [x1, y1, x2, y2, score]'s threshold: confidence/score threshold, e.g. 0.5 type: 'Union' or 'Min' # Returns A list of indices indicating the result of NMS """ if boxes.shape[0] == 0: return [] xx1, yy1, xx2, yy2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] areas = np.multiply(xx2-xx1+1, yy2-yy1+1) sorted_idx = boxes[:, 4].argsort() pick = [] while len(sorted_idx) > 0: # In each loop, pick the last box (highest score) and remove # all other boxes with IoU over threshold tx1 = np.maximum(xx1[sorted_idx[-1]], xx1[sorted_idx[0:-1]]) ty1 = np.maximum(yy1[sorted_idx[-1]], yy1[sorted_idx[0:-1]]) tx2 = np.minimum(xx2[sorted_idx[-1]], xx2[sorted_idx[0:-1]]) ty2 = np.minimum(yy2[sorted_idx[-1]], yy2[sorted_idx[0:-1]]) tw = np.maximum(0.0, tx2 - tx1 + 1) th = np.maximum(0.0, ty2 - ty1 + 1) inter = tw * th if type == 'Min': iou = inter / \ np.minimum(areas[sorted_idx[-1]], areas[sorted_idx[0:-1]]) else: iou = inter / \ (areas[sorted_idx[-1]] + areas[sorted_idx[0:-1]] - inter) pick.append(sorted_idx[-1]) sorted_idx = sorted_idx[np.where(iou <= threshold)[0]] return pick def generate_pnet_bboxes(conf, reg, scale, t): """ # Arguments conf: softmax score (face or not) of each grid reg: regression values of x1, y1, x2, y2 coordinates. The values are normalized to grid width (12) and height (12). scale: scale-down factor with respect to original image t: confidence threshold # Returns A numpy array of bounding box coordinates and the cooresponding scores: [[x1, y1, x2, y2, score], ...] # Notes Top left corner coordinates of each grid is (x*2, y*2), or (x*2/scale, y*2/scale) in the original image. Bottom right corner coordinates is (x*2+12-1, y*2+12-1), or ((x*2+12-1)/scale, (y*2+12-1)/scale) in the original image. 
""" conf = conf.T # swap H and W dimensions dx1 = reg[0, :, :].T dy1 = reg[1, :, :].T dx2 = reg[2, :, :].T dy2 = reg[3, :, :].T (x, y) = np.where(conf >= t) if len(x) == 0: return np.zeros((0, 5), np.float32) score = np.array(conf[x, y]).reshape(-1, 1) # Nx1 reg = np.array([dx1[x, y], dy1[x, y], dx2[x, y], dy2[x, y]]).T * 12. # Nx4 topleft = np.array([x, y], dtype=np.float32).T * 2. # Nx2 bottomright = topleft + np.array([11., 11.], dtype=np.float32) # Nx2 boxes = (np.concatenate((topleft, bottomright), axis=1) + reg) / scale boxes = np.concatenate((boxes, score), axis=1) # Nx5 # filter bboxes which are too small #boxes = boxes[boxes[:, 2]-boxes[:, 0] >= 12., :] #boxes = boxes[boxes[:, 3]-boxes[:, 1] >= 12., :] return boxes def generate_rnet_bboxes(conf, reg, pboxes, t): """ # Arguments conf: softmax score (face or not) of each box reg: regression values of x1, y1, x2, y2 coordinates. The values are normalized to box width and height. pboxes: input boxes to RNet t: confidence threshold # Returns boxes: a numpy array of box coordinates and cooresponding scores: [[x1, y1, x2, y2, score], ...] """ boxes = pboxes.copy() # make a copy assert boxes.shape[0] == conf.shape[0] boxes[:, 4] = conf # update 'score' of all boxes boxes = boxes[conf >= t, :] reg = reg[conf >= t, :] ww = (boxes[:, 2]-boxes[:, 0]+1).reshape(-1, 1) # x2 - x1 + 1 hh = (boxes[:, 3]-boxes[:, 1]+1).reshape(-1, 1) # y2 - y1 + 1 boxes[:, 0:4] += np.concatenate((ww, hh, ww, hh), axis=1) * reg return boxes def generate_onet_outputs(conf, reg_boxes, reg_marks, rboxes, t): """ # Arguments conf: softmax score (face or not) of each box reg_boxes: regression values of x1, y1, x2, y2 The values are normalized to box width and height. reg_marks: regression values of the 5 facial landmark points rboxes: input boxes to ONet (already converted to 2x1) t: confidence threshold # Returns boxes: a numpy array of box coordinates and cooresponding scores: [[x1, y1, x2, y2,... , score], ...] landmarks: a numpy array of facial landmark coordinates: [[x1, x2, ..., x5, y1, y2, ..., y5], ...] """ boxes = rboxes.copy() # make a copy assert boxes.shape[0] == conf.shape[0] boxes[:, 4] = conf boxes = boxes[conf >= t, :] reg_boxes = reg_boxes[conf >= t, :] reg_marks = reg_marks[conf >= t, :] xx = boxes[:, 0].reshape(-1, 1) yy = boxes[:, 1].reshape(-1, 1) ww = (boxes[:, 2]-boxes[:, 0]).reshape(-1, 1) hh = (boxes[:, 3]-boxes[:, 1]).reshape(-1, 1) marks = np.concatenate((xx, xx, xx, xx, xx, yy, yy, yy, yy, yy), axis=1) marks += np.concatenate((ww, ww, ww, ww, ww, hh, hh, hh, hh, hh), axis=1) * reg_marks ww = ww + 1 hh = hh + 1 boxes[:, 0:4] += np.concatenate((ww, hh, ww, hh), axis=1) * reg_boxes return boxes, marks def clip_dets(dets, img_w, img_h): """Round and clip detection (x1, y1, ...) values. Note we exclude the last value of 'dets' in computation since it is 'conf'. """ dets[:, 0:-1] = np.fix(dets[:, 0:-1]) evens = np.arange(0, dets.shape[1]-1, 2) odds = np.arange(1, dets.shape[1]-1, 2) dets[:, evens] = np.clip(dets[:, evens], 0., float(img_w-1)) dets[:, odds] = np.clip(dets[:, odds], 0., float(img_h-1)) return dets class TrtPNet(object): """TrtPNet Refer to mtcnn/det1_relu.prototxt for calculation of input/output dimmensions of TrtPNet, as well as input H offsets (for all scales). The output H offsets are merely input offsets divided by stride (2). 
""" input_h_offsets = (0, 216, 370, 478, 556, 610, 648, 676, 696) output_h_offsets = (0, 108, 185, 239, 278, 305, 324, 338, 348) max_n_scales = 9 def __init__(self, engine): """__init__ # Arguments engine: path to the TensorRT engine file """ self.trtnet = pytrt.PyTrtMtcnn(engine, (3, 710, 384), (2, 350, 187), (4, 350, 187)) self.trtnet.set_batchsize(1) def detect(self, img, minsize=40, factor=0.709, threshold=0.7): """Detect faces using PNet # Arguments img: input image as a RGB numpy array threshold: confidence threshold # Returns A numpy array of bounding box coordinates and the cooresponding scores: [[x1, y1, x2, y2, score], ...] """ if minsize < 40: raise ValueError("TrtPNet is currently designed with " "'minsize' >= 40") if factor > 0.709: raise ValueError("TrtPNet is currently designed with " "'factor' <= 0.709") m = 12.0 / minsize img_h, img_w, _ = img.shape minl = min(img_h, img_w) * m # create scale pyramid scales = [] while minl >= 12: scales.append(m) m *= factor minl *= factor if len(scales) > self.max_n_scales: # probably won't happen... raise ValueError('Too many scales, try increasing minsize ' 'or decreasing factor.') total_boxes = np.zeros((0, 5), dtype=np.float32) img = (img.astype(np.float32) - PIXEL_MEAN) * PIXEL_SCALE # stack all scales of the input image vertically into 1 big # image, and only do inferencing once im_data = np.zeros((1, 3, 710, 384), dtype=np.float32) for i, scale in enumerate(scales): h_offset = self.input_h_offsets[i] h = int(img_h * scale) w = int(img_w * scale) im_data[0, :, h_offset:(h_offset+h), :w] = \ cv2.resize(img, (w, h)).transpose((2, 0, 1)) out = self.trtnet.forward(im_data) # extract outputs of each scale from the big output blob for i, scale in enumerate(scales): h_offset = self.output_h_offsets[i] h = (int(img_h * scale) - 12) // 2 + 1 w = (int(img_w * scale) - 12) // 2 + 1 pp = out['prob1'][0, 1, h_offset:(h_offset+h), :w] cc = out['boxes'][0, :, h_offset:(h_offset+h), :w] boxes = generate_pnet_bboxes(pp, cc, scale, threshold) if boxes.shape[0] > 0: pick = nms(boxes, 0.5, 'Union') if len(pick) > 0: boxes = boxes[pick, :] if boxes.shape[0] > 0: total_boxes = np.concatenate((total_boxes, boxes), axis=0) if total_boxes.shape[0] == 0: return total_boxes pick = nms(total_boxes, 0.7, 'Union') dets = clip_dets(total_boxes[pick, :], img_w, img_h) return dets def destroy(self): self.trtnet.destroy() self.trtnet = None class TrtRNet(object): """TrtRNet # Arguments engine: path to the TensorRT engine (det2) file """ def __init__(self, engine): self.trtnet = pytrt.PyTrtMtcnn(engine, (3, 24, 24), (2, 1, 1), (4, 1, 1)) def detect(self, img, boxes, max_batch=256, threshold=0.6): """Detect faces using RNet # Arguments img: input image as a RGB numpy array boxes: detection results by PNet, a numpy array [:, 0:5] of [x1, y1, x2, y2, score]'s max_batch: only process these many top boxes from PNet threshold: confidence threshold # Returns A numpy array of bounding box coordinates and the cooresponding scores: [[x1, y1, x2, y2, score], ...] """ if max_batch > 256: raise ValueError('Bad max_batch: %d' % max_batch) boxes = boxes[:max_batch] # assuming boxes are sorted by score if boxes.shape[0] == 0: return boxes img_h, img_w, _ = img.shape boxes = convert_to_1x1(boxes) crops = np.zeros((boxes.shape[0], 24, 24, 3), dtype=np.uint8) for i, det in enumerate(boxes): cropped_im = crop_img_with_padding(img, det) # NOTE: H and W dimensions need to be transposed for RNet! crops[i, ...] 
class TrtRNet(object):
    """TrtRNet

    # Arguments
        engine: path to the TensorRT engine (det2) file
    """

    def __init__(self, engine):
        self.trtnet = pytrt.PyTrtMtcnn(engine,
                                       (3, 24, 24),
                                       (2, 1, 1),
                                       (4, 1, 1))

    def detect(self, img, boxes, max_batch=256, threshold=0.6):
        """Detect faces using RNet

        # Arguments
            img: input image as a RGB numpy array
            boxes: detection results by PNet, a numpy array [:, 0:5]
                   of [x1, y1, x2, y2, score]'s
            max_batch: only process these many top boxes from PNet
            threshold: confidence threshold

        # Returns
            A numpy array of bounding box coordinates and the
            corresponding scores: [[x1, y1, x2, y2, score], ...]
        """
        if max_batch > 256:
            raise ValueError('Bad max_batch: %d' % max_batch)
        boxes = boxes[:max_batch]  # assuming boxes are sorted by score
        if boxes.shape[0] == 0:
            return boxes
        img_h, img_w, _ = img.shape
        boxes = convert_to_1x1(boxes)

        crops = np.zeros((boxes.shape[0], 24, 24, 3), dtype=np.uint8)
        for i, det in enumerate(boxes):
            cropped_im = crop_img_with_padding(img, det)
            # NOTE: H and W dimensions need to be transposed for RNet!
            crops[i, ...] = cv2.transpose(cv2.resize(cropped_im, (24, 24)))
        crops = crops.transpose((0, 3, 1, 2))  # NHWC -> NCHW
        crops = (crops.astype(np.float32) - PIXEL_MEAN) * PIXEL_SCALE

        self.trtnet.set_batchsize(crops.shape[0])
        out = self.trtnet.forward(crops)

        pp = out['prob1'][:, 1, 0, 0]
        cc = out['boxes'][:, :, 0, 0]
        boxes = generate_rnet_bboxes(pp, cc, boxes, threshold)
        if boxes.shape[0] == 0:
            return boxes
        pick = nms(boxes, 0.7, 'Union')
        dets = clip_dets(boxes[pick, :], img_w, img_h)
        return dets

    def destroy(self):
        self.trtnet.destroy()
        self.trtnet = None


class TrtONet(object):
    """TrtONet

    # Arguments
        engine: path to the TensorRT engine (det3) file
    """

    def __init__(self, engine):
        self.trtnet = pytrt.PyTrtMtcnn(engine,
                                       (3, 48, 48),
                                       (2, 1, 1),
                                       (4, 1, 1),
                                       (10, 1, 1))

    def detect(self, img, boxes, max_batch=64, threshold=0.7):
        """Detect faces using ONet

        # Arguments
            img: input image as a RGB numpy array
            boxes: detection results by RNet, a numpy array [:, 0:5]
                   of [x1, y1, x2, y2, score]'s
            max_batch: only process these many top boxes from RNet
            threshold: confidence threshold

        # Returns
            dets: boxes and conf scores
            landmarks
        """
        if max_batch > 64:
            raise ValueError('Bad max_batch: %d' % max_batch)
        if boxes.shape[0] == 0:
            return (np.zeros((0, 5), dtype=np.float32),
                    np.zeros((0, 10), dtype=np.float32))
        boxes = boxes[:max_batch]  # assuming boxes are sorted by score
        img_h, img_w, _ = img.shape
        boxes = convert_to_1x1(boxes)

        crops = np.zeros((boxes.shape[0], 48, 48, 3), dtype=np.uint8)
        for i, det in enumerate(boxes):
            cropped_im = crop_img_with_padding(img, det)
            # NOTE: H and W dimensions need to be transposed for ONet!
            crops[i, ...] = cv2.transpose(cv2.resize(cropped_im, (48, 48)))
        crops = crops.transpose((0, 3, 1, 2))  # NHWC -> NCHW
        crops = (crops.astype(np.float32) - PIXEL_MEAN) * PIXEL_SCALE

        self.trtnet.set_batchsize(crops.shape[0])
        out = self.trtnet.forward(crops)

        pp = out['prob1'][:, 1, 0, 0]
        cc = out['boxes'][:, :, 0, 0]
        mm = out['landmarks'][:, :, 0, 0]
        boxes, landmarks = generate_onet_outputs(pp, cc, mm, boxes, threshold)
        pick = nms(boxes, 0.7, 'Min')
        return (clip_dets(boxes[pick, :], img_w, img_h),
                np.fix(landmarks[pick, :]))

    def destroy(self):
        self.trtnet.destroy()
        self.trtnet = None


class TrtMtcnn(object):
    """TrtMtcnn"""

    def __init__(self):
        self.pnet = TrtPNet('mtcnn/det1.engine')
        self.rnet = TrtRNet('mtcnn/det2.engine')
        self.onet = TrtONet('mtcnn/det3.engine')

    def __del__(self):
        self.onet.destroy()
        self.rnet.destroy()
        self.pnet.destroy()

    def _detect_1280x720(self, img, minsize):
        """_detect_1280x720()

        Assuming 'img' has been resized to less than 1280x720.
        """
        # MTCNN model was trained with 'MATLAB' image so its channel
        # order is RGB instead of BGR.
        img = img[:, :, ::-1]  # BGR -> RGB
        dets = self.pnet.detect(img, minsize=minsize)
        dets = self.rnet.detect(img, dets)
        dets, landmarks = self.onet.detect(img, dets)
        return dets, landmarks

    def detect(self, img, minsize=40):
        """detect()

        This function handles rescaling of the input image if it's
        larger than 1280x720.
        """
        if img is None:
            raise ValueError('img is None')
        img_h, img_w, _ = img.shape
        scale = min(720. / img_h, 1280. / img_w)
        if scale < 1.0:
            new_h = int(np.ceil(img_h * scale))
            new_w = int(np.ceil(img_w * scale))
            img = cv2.resize(img, (new_w, new_h))
            minsize = max(int(np.ceil(minsize * scale)), 40)

        dets, landmarks = self._detect_1280x720(img, minsize)

        if scale < 1.0:
            dets[:, :-1] = np.fix(dets[:, :-1] / scale)
            landmarks = np.fix(landmarks / scale)
        return dets, landmarks
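
if __name__ == '__main__':
    # Minimal usage sketch (an assumption, not part of the original
    # module): read an image with OpenCV and run the full MTCNN
    # cascade.  Requires the three TensorRT engine files referenced
    # above ('mtcnn/det1.engine', etc.) to have been built already.
    import sys
    img = cv2.imread(sys.argv[1])  # BGR image, as expected by detect()
    if img is None:
        raise SystemExit('cannot read image: %s' % sys.argv[1])
    mtcnn = TrtMtcnn()
    dets, landmarks = mtcnn.detect(img, minsize=40)
    print('found %d face(s)' % dets.shape[0])
    for (x1, y1, x2, y2, conf) in dets:
        print('  box=(%d, %d, %d, %d), conf=%.2f' % (x1, y1, x2, y2, conf))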