commit eb89f036bd3cc6afc90a25d79771281755b80031 Author: 12345qiupeng Date: Mon Mar 6 20:44:29 2023 +0800 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1440668 --- /dev/null +++ b/.gitignore @@ -0,0 +1,43 @@ +__pycache__ +*.pyc + +pytrt.cpp +build +pytrt.*.so + +*.jpg +*.png +*.mp4 +*.ts + +googlenet/*.engine +googlenet/chobj +googlenet/dchobj +googlenet/create_engine + +mtcnn/*.engine +mtcnn/chobj +mtcnn/dchobj +mtcnn/create_engines + +ssd/libflattenconcat.so +ssd/*.uff +ssd/*.pbtxt +ssd/*.bin +ssd/*.json + +yolo/yolo*.cfg +yolo/yolo*.weights +yolo/yolo*.onnx +yolo/yolo*.trt +yolo/*.json +yolo/calib_images/ +yolo/calib_*.bin + +plugins/*.o +plugins/*.so + +modnet/venv* +modnet/*.ckpt +modnet/*.onnx +modnet/*.engine diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..134e8b5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "modnet/onnx-tensorrt"] + path = modnet/onnx-tensorrt + url = https://github.com/onnx/onnx-tensorrt.git diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..f5ab5b8 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,15 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..b512b7a --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..8924a51 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/tensorrt_demos-master.iml b/.idea/tensorrt_demos-master.iml new file mode 100644 index 0000000..2342ee9 --- /dev/null +++ b/.idea/tensorrt_demos-master.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..dad7a0b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 JK Jung + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9eda762 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +PYTHON ?= python3 + +all: + ${PYTHON} setup.py build_ext -if + rm -rf build + +clean: + rm -rf build pytrt.cpp *.so diff --git a/README.md b/README.md new file mode 100644 index 0000000..d793ddc --- /dev/null +++ b/README.md @@ -0,0 +1,545 @@ +# tensorrt_demos + +Examples demonstrating how to optimize Caffe/TensorFlow/DarkNet/PyTorch models with TensorRT. + +Highlights: + +* Run an optimized "MODNet" video matting model at ~21 FPS on Jetson Xavier NX. +* Run an optimized "yolov4-416" object detector at ~4.6 FPS on Jetson Nano. +* Run an optimized "yolov3-416" object detector at ~4.9 FPS on Jetson Nano. +* Run an optimized "ssd_mobilenet_v1_coco" object detector ("trt_ssd_async.py") at 27~28 FPS on Jetson Nano. +* Run an optimized "MTCNN" face detector at 6~11 FPS on Jetson Nano. +* Run an optimized "GoogLeNet" image classifier at "~16 ms per image (inference only)" on Jetson Nano. + +Supported hardware: + +* NVIDIA Jetson + - All NVIDIA Jetson Developer Kits, e.g. [Jetson AGX Orin DevKit](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/#advanced-features), [Jetson AGX Xavier DevKit](https://developer.nvidia.com/embedded/jetson-agx-xavier-developer-kit), [Jetson Xavier NX DevKit](https://developer.nvidia.com/embedded/jetson-xavier-nx-devkit), Jetson TX2 DevKit, [Jetson Nano DevKit](https://developer.nvidia.com/embedded/jetson-nano-developer-kit). + - Seeed [reComputer J1010](https://www.seeedstudio.com/Jetson-10-1-A0-p-5336.html) with Jetson Nano and [reComputer J2021](https://www.seeedstudio.com/reComputer-J2021-p-5438.html) with Jetson Xavier NX, which are built with NVIDIA Jetson production module and pre-installed with NVIDIA [JetPack SDK](https://developer.nvidia.com/embedded/jetpack). +* x86_64 PC with modern NVIDIA GPU(s). Refer to [README_x86.md](https://github.com/jkjung-avt/tensorrt_demos/blob/master/README_x86.md) for more information. + +Table of contents +----------------- + +* [Prerequisite](#prerequisite) +* [Demo #1: GoogLeNet](#googlenet) +* [Demo #2: MTCNN](#mtcnn) +* [Demo #3: SSD](#ssd) +* [Demo #4: YOLOv3](#yolov3) +* [Demo #5: YOLOv4](#yolov4) +* [Demo #6: Using INT8 and DLA core](#int8_and_dla) +* [Demo #7: MODNet](#modnet) + + +Prerequisite +------------ + +The code in this repository was tested on Jetson Nano, TX2, and Xavier NX DevKits. In order to run the demos below, first make sure you have the proper version of image (JetPack) installed on the target Jetson system. For example, [Setting up Jetson Nano: The Basics](https://jkjung-avt.github.io/setting-up-nano/) and [Setting up Jetson Xavier NX](https://jkjung-avt.github.io/setting-up-xavier-nx/). + +More specifically, the target Jetson system must have TensorRT libraries installed. + +* Demo #1 and Demo #2: works for TensorRT 3.x+, +* Demo #3: requires TensoRT 5.x+, +* Demo #4 and Demo #5: requires TensorRT 6.x+. 
+* Demo #6 part 1: INT8 requires TensorRT 6.x+ and only works on GPUs with CUDA compute 6.1+. +* Demo #6 part 2: DLA core requires TensorRT 7.x+ (is only tested on Jetson Xavier NX). +* Demo #7: requires TensorRT 7.x+. + +You could check which version of TensorRT has been installed on your Jetson system by looking at file names of the libraries. For example, TensorRT v5.1.6 (JetPack-4.2.2) was present on one of my Jetson Nano DevKits. + +```shell +$ ls /usr/lib/aarch64-linux-gnu/libnvinfer.so* +/usr/lib/aarch64-linux-gnu/libnvinfer.so +/usr/lib/aarch64-linux-gnu/libnvinfer.so.5 +/usr/lib/aarch64-linux-gnu/libnvinfer.so.5.1.6 +``` + +Furthermore, all demo programs in this repository require "cv2" (OpenCV) module for python3. You could use the "cv2" module which came in the JetPack. Or, if you'd prefer building your own, refer to [Installing OpenCV 3.4.6 on Jetson Nano](https://jkjung-avt.github.io/opencv-on-nano/) for how to build from source and install opencv-3.4.6 on your Jetson system. + +If you plan to run Demo #3 (SSD), you'd also need to have "tensorflow-1.x" installed. You could probably use the [official tensorflow wheels provided by NVIDIA](https://docs.nvidia.com/deeplearning/frameworks/pdf/Install-TensorFlow-Jetson-Platform.pdf), or refer to [Building TensorFlow 1.12.2 on Jetson Nano](https://jkjung-avt.github.io/build-tensorflow-1.12.2/) for how to install tensorflow-1.12.2 on the Jetson system. + +Or if you plan to run Demo #4 and Demo #5, you'd need to have "protobuf" installed. I recommend installing "protobuf-3.8.0" using my [install_protobuf-3.8.0.sh](https://github.com/jkjung-avt/jetson_nano/blob/master/install_protobuf-3.8.0.sh) script. This script would take a couple of hours to finish on a Jetson system. Alternatively, doing `pip3 install` with a recent version of "protobuf" should also work (but might run a little bit slowlier). + +In case you are setting up a Jetson Nano, TX2 or Xavier NX from scratch to run these demos, you could refer to the following blog posts. + +* [JetPack-4.6](https://jkjung-avt.github.io/jetpack-4.6/) +* [JetPack-4.5](https://jkjung-avt.github.io/jetpack-4.5/) +* [Setting up Jetson Xavier NX](https://jkjung-avt.github.io/setting-up-xavier-nx/) +* [JetPack-4.4 for Jetson Nano](https://jkjung-avt.github.io/jetpack-4.4/) +* [JetPack-4.3 for Jetson Nano](https://jkjung-avt.github.io/jetpack-4.3/) + + +Demo #1: GoogLeNet +------------------ + +This demo illustrates how to convert a prototxt file and a caffemodel file into a TensorRT engine file, and to classify images with the optimized TensorRT engine. + +Step-by-step: + +1. Clone this repository. + + ```shell + $ cd ${HOME}/project + $ git clone https://github.com/jkjung-avt/tensorrt_demos.git + $ cd tensorrt_demos + ``` + +2. Build the TensorRT engine from the pre-trained googlenet (ILSVRC2012) model. Note that I downloaded the pre-trained model files from [BVLC caffe](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet) and have put a copy of all necessary files in this repository. + + ```shell + $ cd ${HOME}/project/tensorrt_demos/googlenet + $ make + $ ./create_engine + ``` + +3. Build the Cython code. Install Cython if not previously installed. + + ```shell + $ sudo pip3 install Cython + $ cd ${HOME}/project/tensorrt_demos + $ make + ``` + +4. Run the "trt_googlenet.py" demo program. For example, run the demo using a USB webcam (/dev/video0) as the input. 
+ + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_googlenet.py --usb 0 --width 1280 --height 720 + ``` + + Here's a screenshot of the demo (JetPack-4.2.2, i.e. TensorRT 5). + + ![A picture of a golden retriever](https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/doc/golden_retriever.png) + +5. The demo program supports 5 different image/video inputs. You could do `python3 trt_googlenet.py --help` to read the help messages. Or more specifically, the following inputs could be specified: + + * `--image test_image.jpg`: an image file, e.g. jpg or png. + * `--video test_video.mp4`: a video file, e.g. mp4 or ts. An optional `--video_looping` flag could be enabled if needed. + * `--usb 0`: USB webcam (/dev/video0). + * `--rtsp rtsp://admin:123456@192.168.1.1/live.sdp`: RTSP source, e.g. an IP cam. An optional `--rtsp_latency` argument could be used to adjust the latency setting in this case. + * `--onboard 0`: Jetson onboard camera. + + In additional, you could use `--width` and `--height` to specify the desired input image size, and use `--do_resize` to force resizing of image/video file source. + + The `--usb`, `--rtsp` and `--onboard` video sources usually produce image frames at 30 FPS. If the TensorRT engine inference code runs faster than that (which happens easily on a x86_64 PC with a good GPU), one particular image could be inferenced multiple times before the next image frame becomes available. This causes problem in the object detector demos, since the original image could have been altered (bounding boxes drawn) and the altered image is taken for inference again. To cope with this problem, use the optional `--copy_frame` flag to force copying/cloning image frames internally. + +6. Check out my blog post for implementation details: + + * [Running TensorRT Optimized GoogLeNet on Jetson Nano](https://jkjung-avt.github.io/tensorrt-googlenet/) + + +Demo #2: MTCNN +-------------- + +This demo builds upon the previous one. It converts 3 sets of prototxt and caffemodel files into 3 TensorRT engines, namely the PNet, RNet and ONet. Then it combines the 3 engine files to implement MTCNN, a very good face detector. + +Assuming this repository has been cloned at "${HOME}/project/tensorrt_demos", follow these steps: + +1. Build the TensorRT engines from the pre-trained MTCNN model. (Refer to [mtcnn/README.md](https://github.com/jkjung-avt/tensorrt_demos/blob/master/mtcnn/README.md) for more information about the prototxt and caffemodel files.) + + ```shell + $ cd ${HOME}/project/tensorrt_demos/mtcnn + $ make + $ ./create_engines + ``` + +2. Build the Cython code if it has not been done yet. Refer to step 3 in Demo #1. + +3. Run the "trt_mtcnn.py" demo program. For example, I grabbed from the internet a poster of The Avengers for testing. + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_mtcnn.py --image ${HOME}/Pictures/avengers.jpg + ``` + + Here's the result (JetPack-4.2.2, i.e. TensorRT 5). + + ![Avengers faces detected](https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/doc/avengers.png) + +4. The "trt_mtcnn.py" demo program could also take various image inputs. Refer to step 5 in Demo #1 for details. + +5. 
Check out my related blog posts: + + * [TensorRT MTCNN Face Detector](https://jkjung-avt.github.io/tensorrt-mtcnn/) + * [Optimizing TensorRT MTCNN](https://jkjung-avt.github.io/optimize-mtcnn/) + + +Demo #3: SSD +------------ + +This demo shows how to convert pre-trained tensorflow Single-Shot Multibox Detector (SSD) models through UFF to TensorRT engines, and to do real-time object detection with the TensorRT engines. + +NOTE: This particular demo requires TensorRT "Python API", which is only available in TensorRT 5.x+ on the Jetson systems. In other words, this demo only works on Jetson systems properly set up with JetPack-4.2+, but **not** JetPack-3.x or earlier versions. + +Assuming this repository has been cloned at "${HOME}/project/tensorrt_demos", follow these steps: + +1. Install requirements (pycuda, etc.) and build TensorRT engines from the pre-trained SSD models. + + ```shell + $ cd ${HOME}/project/tensorrt_demos/ssd + $ ./install.sh + $ ./build_engines.sh + ``` + + NOTE: On my Jetson Nano DevKit with TensorRT 5.1.6, the version number of UFF converter was "0.6.3". When I ran "build_engine.py", the UFF library actually printed out: `UFF has been tested with tensorflow 1.12.0. Other versions are not guaranteed to work.` So I would strongly suggest you to use **tensorflow 1.12.x** (or whatever matching version for the UFF library installed on your system) when converting pb to uff. + +2. Run the "trt_ssd.py" demo program. The demo supports 4 models: "ssd_mobilenet_v1_coco", "ssd_mobilenet_v1_egohands", "ssd_mobilenet_v2_coco", or "ssd_mobilenet_v2_egohands". For example, I tested the "ssd_mobilenet_v1_coco" model with the "huskies" picture. + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_ssd.py --image ${HOME}/project/tf_trt_models/examples/detection/data/huskies.jpg \ + --model ssd_mobilenet_v1_coco + ``` + + Here's the result (JetPack-4.2.2, i.e. TensorRT 5). Frame rate was good (over 20 FPS). + + ![Huskies detected](https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/doc/huskies.png) + + NOTE: When running this demo with TensorRT 6 (JetPack-4.3) on the Jetson Nano, I encountered the following error message which could probably be ignored for now. Quote from [NVIDIA's NVES_R](https://devtalk.nvidia.com/default/topic/1065233/tensorrt/-tensorrt-error-could-not-register-plugin-creator-flattenconcat_trt-in-namespace-/post/5394191/#5394191): `This is a known issue and will be fixed in a future version.` + + ``` + [TensorRT] ERROR: Could not register plugin creator: FlattenConcat_TRT in namespace + ``` + + I also tested the "ssd_mobilenet_v1_egohands" (hand detector) model with a video clip from YouTube, and got the following result. Again, frame rate was pretty good. But the detection didn't seem very accurate though :-( + + ```shell + $ python3 trt_ssd.py --video ${HOME}/Videos/Nonverbal_Communication.mp4 \ + --model ssd_mobilenet_v1_egohands + ``` + + (Click on the image below to see the whole video clip...) + + [![Hands detected](https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/doc/hands.png)](https://youtu.be/3ieN5BBdDF0) + +3. The "trt_ssd.py" demo program could also take various image inputs. Refer to step 5 in Demo #1 again. + +4. Referring to this comment, ["#TODO enable video pipeline"](https://github.com/AastaNV/TRT_object_detection/blob/master/main.py#L78), in the original TRT_object_detection code, I did implement an "async" version of ssd detection code to do just that. 
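+
+   The gist of the "async" version is to decouple image capture from TensorRT inference so the two can overlap. Below is a rough, hypothetical sketch of that producer/consumer idea (simplified for illustration; the actual "trt_ssd_async.py" differs in details such as which work runs in the child thread and how the CUDA context is handled):
+
+   ```python
+   # Producer/consumer sketch: grab frames in a background thread so the
+   # main loop always runs inference on the most recent frame.
+   import threading
+   import cv2
+
+   class FrameGrabber(threading.Thread):
+       def __init__(self, cap):
+           super().__init__(daemon=True)
+           self.cap, self.lock = cap, threading.Lock()
+           self.frame, self.running = None, True
+
+       def run(self):
+           while self.running:
+               ok, img = self.cap.read()
+               if ok:
+                   with self.lock:
+                       self.frame = img
+
+       def latest(self):
+           with self.lock:
+               return None if self.frame is None else self.frame.copy()
+
+   grabber = FrameGrabber(cv2.VideoCapture(0))
+   grabber.start()
+   while True:
+       img = grabber.latest()
+       if img is None:
+           continue
+       # boxes, confs, clss = trt_ssd.detect(img, 0.3)  # TensorRT inference here
+       cv2.imshow('async sketch', img)
+       if cv2.waitKey(1) == 27:                         # ESC quits
+           break
+   grabber.running = False
+   ```
+
+   Because the grabber keeps overwriting a single buffered frame, inference never queues up stale frames even when the camera produces images faster than the GPU can consume them.
+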
When I tested "ssd_mobilenet_v1_coco" on the same huskies image with the async demo program on the Jetson Nano DevKit, the frame rate improved by 3~4 FPS. + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_ssd_async.py --image ${HOME}/project/tf_trt_models/examples/detection/data/huskies.jpg \ + --model ssd_mobilenet_v1_coco + ``` + +5. To verify accuracy (mAP) of the optimized TensorRT engines and make sure they do not degrade too much (due to reduced floating-point precision of "FP16") from the original TensorFlow frozen inference graphs, you could prepare validation data and run "eval_ssd.py". Refer to [README_mAP.md](README_mAP.md) for details. + + I compared mAP of the TensorRT engine and the original tensorflow model for both "ssd_mobilenet_v1_coco" and "ssd_mobilenet_v2_coco" using COCO "val2017" data. The results were good. In both cases, mAP of the optimized TensorRT engine matched the original tensorflow model. The FPS (frames per second) numbers in the table were measured using "trt_ssd_async.py" on my Jetson Nano DevKit with JetPack-4.3. + + | TensorRT engine | mAP @<br>IoU=0.5:0.95 | mAP @<br>
IoU=0.5 | FPS on Nano | + |:------------------------|:---------------------:|:------------------:|:-----------:| + | mobilenet_v1 TF | 0.232 | 0.351 | -- | + | mobilenet_v1 TRT (FP16) | 0.232 | 0.351 | 27.7 | + | mobilenet_v2 TF | 0.248 | 0.375 | -- | + | mobilenet_v2 TRT (FP16) | 0.248 | 0.375 | 22.7 | + +6. Check out my blog posts for implementation details: + + * [TensorRT UFF SSD](https://jkjung-avt.github.io/tensorrt-ssd/) + * [Speeding Up TensorRT UFF SSD](https://jkjung-avt.github.io/speed-up-trt-ssd/) + * [Verifying mAP of TensorRT Optimized SSD and YOLOv3 Models](https://jkjung-avt.github.io/trt-detection-map/) + * Or if you'd like to learn how to train your own custom object detectors which could be easily converted to TensorRT engines and inferenced with "trt_ssd.py" and "trt_ssd_async.py": [Training a Hand Detector with TensorFlow Object Detection API](https://jkjung-avt.github.io/hand-detection-tutorial/) + + +Demo #4: YOLOv3 +--------------- + +(Merged with Demo #5: YOLOv4...) + + +Demo #5: YOLOv4 +--------------- + +Along the same line as Demo #3, these 2 demos showcase how to convert pre-trained yolov3 and yolov4 models through ONNX to TensorRT engines. The code for these 2 demos has gone through some significant changes. More specifically, I have recently updated the implementation with a "yolo_layer" plugin to speed up inference time of the yolov3/yolov4 models. + +My current "yolo_layer" plugin implementation is based on TensorRT's [IPluginV2IOExt](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_plugin_v2_i_o_ext.html). It only works for **TensorRT 6+**. I'm thinking about updating the code to support TensorRT 5 if I have time late on. + +I developed my "yolo_layer" plugin by referencing similar plugin code by [wang-xinyu](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov4) and [dongfangduoshou123](https://github.com/dongfangduoshou123/YoloV3-TensorRT/blob/master/seralizeEngineFromPythonAPI.py). So big thanks to both of them. + +Assuming this repository has been cloned at "${HOME}/project/tensorrt_demos", follow these steps: + +1. Install "pycuda". + + ```shell + $ cd ${HOME}/project/tensorrt_demos/yolo + $ ./install_pycuda.sh + ``` + +2. Install **version "1.9.0"** of python3 **"onnx"** module. Note that the "onnx" module would depend on "protobuf" as stated in the [Prerequisite](#prerequisite) section. + + ```shell + $ sudo pip3 install onnx==1.9.0 + ``` + +3. Go to the "plugins/" subdirectory and build the "yolo_layer" plugin. When done, a "libyolo_layer.so" would be generated. + + ```shell + $ cd ${HOME}/project/tensorrt_demos/plugins + $ make + ``` + +4. Download the pre-trained yolov3/yolov4 COCO models and convert the targeted model to ONNX and then to TensorRT engine. I use "yolov4-416" as example below. (Supported models: "yolov3-tiny-288", "yolov3-tiny-416", "yolov3-288", "yolov3-416", "yolov3-608", "yolov3-spp-288", "yolov3-spp-416", "yolov3-spp-608", "yolov4-tiny-288", "yolov4-tiny-416", "yolov4-288", "yolov4-416", "yolov4-608", "yolov4-csp-256", "yolov4-csp-512", "yolov4x-mish-320", "yolov4x-mish-640", and [custom models](https://jkjung-avt.github.io/trt-yolo-custom-updated/) such as "yolov4-416x256".) + + ```shell + $ cd ${HOME}/project/tensorrt_demos/yolo + $ ./download_yolo.sh + $ python3 yolo_to_onnx.py -m yolov4-416 + $ python3 onnx_to_tensorrt.py -m yolov4-416 + ``` + + The last step ("onnx_to_tensorrt.py") takes a little bit more than half an hour to complete on my Jetson Nano DevKit. 
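+
+   Most of that time is spent in TensorRT's builder auto-tuning kernels for the target GPU. Conceptually, the script follows the standard TensorRT ONNX-parser flow; a condensed, hypothetical sketch is shown below (the real "onnx_to_tensorrt.py" additionally registers the "yolo_layer" plugin and handles the INT8/DLA options, and the Python API varies a bit across TensorRT versions):
+
+   ```python
+   # Condensed ONNX -> TensorRT (FP16) conversion sketch (TensorRT 7-style API).
+   import tensorrt as trt
+
+   TRT_LOGGER = trt.Logger(trt.Logger.INFO)
+   EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+   builder = trt.Builder(TRT_LOGGER)
+   network = builder.create_network(EXPLICIT_BATCH)
+   parser = trt.OnnxParser(network, TRT_LOGGER)
+   config = builder.create_builder_config()
+   config.max_workspace_size = 1 << 28        # keep the workspace modest on Nano
+   config.set_flag(trt.BuilderFlag.FP16)
+
+   with open('yolov4-416.onnx', 'rb') as f:
+       if not parser.parse(f.read()):
+           for i in range(parser.num_errors):
+               print(parser.get_error(i))
+           raise SystemExit('failed to parse the ONNX file')
+
+   engine = builder.build_engine(network, config)   # this is the slow part
+   with open('yolov4-416.trt', 'wb') as f:
+       f.write(engine.serialize())
+   ```
+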
When that is done, the optimized TensorRT engine would be saved as "yolov4-416.trt". + + In case "onnx_to_tensorrt.py" fails (process "Killed" by Linux kernel), it could likely be that the Jetson platform runs out of memory during conversion of the TensorRT engine. This problem might be solved by adding a larger swap file to the system. Reference: [Process killed in onnx_to_tensorrt.py Demo#5](https://github.com/jkjung-avt/tensorrt_demos/issues/344). + +5. Test the TensorRT "yolov4-416" engine with the "dog.jpg" image. + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ wget https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg -O ${HOME}/Pictures/dog.jpg + $ python3 trt_yolo.py --image ${HOME}/Pictures/dog.jpg \ + -m yolov4-416 + ``` + + This is a screenshot of the demo against JetPack-4.4, i.e. TensorRT 7. + + !["yolov4-416" detection result on dog.jpg](doc/dog_trt_yolov4_416.jpg) + +6. The "trt_yolo.py" demo program could also take various image inputs. Refer to step 5 in Demo #1 again. + + For example, I tested my own custom trained ["yolov4-crowdhuman-416x416"](https://github.com/jkjung-avt/yolov4_crowdhuman) TensorRT engine with the "Avengers: Infinity War" movie trailer: + + [![Testing with the Avengers: Infinity War trailer](https://raw.githubusercontent.com/jkjung-avt/yolov4_crowdhuman/master/doc/infinity_war.jpg)](https://youtu.be/7Qr_Fq18FgM) + +7. (Optional) Test other models than "yolov4-416". + +8. (Optional) If you would like to stream TensorRT YOLO detection output over the network and view the results on a remote host, check out my [trt_yolo_mjpeg.py example](https://github.com/jkjung-avt/tensorrt_demos/issues/226). + +9. Similar to step 5 of Demo #3, I created an "eval_yolo.py" for evaluating mAP of the TensorRT yolov3/yolov4 engines. Refer to [README_mAP.md](README_mAP.md) for details. + + ```shell + $ python3 eval_yolo.py -m yolov3-tiny-288 + $ python3 eval_yolo.py -m yolov4-tiny-416 + ...... + $ python3 eval_yolo.py -m yolov4-608 + $ python3 eval_yolo.py -l -m yolov4-csp-256 + ...... + $ python3 eval_yolo.py -l -m yolov4x-mish-640 + ``` + + I evaluated all these TensorRT yolov3/yolov4 engines with COCO "val2017" data and got the following results. I also checked the FPS (frames per second) numbers on my Jetson Nano DevKit with JetPack-4.4 (TensorRT 7). + + | TensorRT engine | mAP @
IoU=0.5:0.95 | mAP @
IoU=0.5 | FPS on Nano | + |:------------------------|:---------------------:|:------------------:|:-----------:| + | yolov3-tiny-288 (FP16) | 0.077 | 0.158 | 35.8 | + | yolov3-tiny-416 (FP16) | 0.096 | 0.202 | 25.5 | + | yolov3-288 (FP16) | 0.331 | 0.601 | 8.16 | + | yolov3-416 (FP16) | 0.373 | 0.664 | 4.93 | + | yolov3-608 (FP16) | 0.376 | 0.665 | 2.53 | + | yolov3-spp-288 (FP16) | 0.339 | 0.594 | 8.16 | + | yolov3-spp-416 (FP16) | 0.391 | 0.664 | 4.82 | + | yolov3-spp-608 (FP16) | 0.410 | 0.685 | 2.49 | + | yolov4-tiny-288 (FP16) | 0.179 | 0.344 | 36.6 | + | yolov4-tiny-416 (FP16) | 0.196 | 0.387 | 25.5 | + | yolov4-288 (FP16) | 0.376 | 0.591 | 7.93 | + | yolov4-416 (FP16) | 0.459 | 0.700 | 4.62 | + | yolov4-608 (FP16) | 0.488 | 0.736 | 2.35 | + | yolov4-csp-256 (FP16) | 0.336 | 0.502 | 12.8 | + | yolov4-csp-512 (FP16) | 0.436 | 0.630 | 4.26 | + | yolov4x-mish-320 (FP16) | 0.400 | 0.581 | 4.79 | + | yolov4x-mish-640 (FP16) | 0.470 | 0.668 | 1.46 | + +10. Check out my blog posts for implementation details: + + * [TensorRT ONNX YOLOv3](https://jkjung-avt.github.io/tensorrt-yolov3/) + * [TensorRT YOLOv4](https://jkjung-avt.github.io/tensorrt-yolov4/) + * [Verifying mAP of TensorRT Optimized SSD and YOLOv3 Models](https://jkjung-avt.github.io/trt-detection-map/) + * For training your own custom yolov4 model: [Custom YOLOv4 Model on Google Colab](https://jkjung-avt.github.io/colab-yolov4/) + * For adapting the code to your own custom trained yolov3/yolov4 models: [TensorRT YOLO For Custom Trained Models (Updated)](https://jkjung-avt.github.io/trt-yolo-custom-updated/) + + +Demo #6: Using INT8 and DLA core +-------------------------------- + +NVIDIA introduced [INT8 TensorRT inferencing](https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf) since CUDA compute 6.1+. For the embedded Jetson product line, INT8 is available on Jetson AGX Xavier and Xavier NX. In addition, NVIDIA further introduced [Deep Learning Accelerator (NVDLA)](http://nvdla.org/) on Jetson Xavier NX. I tested both features on my Jetson Xavier NX DevKit, and shared the source code in this repo. + +Please make sure you have gone through the steps of [Demo #5](#yolov4) and are able to run TensorRT yolov3/yolov4 engines successfully, before following along: + +1. In order to use INT8 TensorRT, you'll first have to prepare some images for "calibration". These images for calibration should cover all distributions of possible image inputs at inference time. According to [official documentation](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#optimizing_int8_c), 500 of such images are suggested by NVIDIA. As an example, I used 1,000 images from the COCO "val2017" dataset for that purpose. Note that I've previously downloaded the "val2017" images for [mAP evaluation](README_mAP.md). + + ```shell + $ cd ${HOME}/project/tensorrt_demos/yolo + $ mkdir calib_images + ### randomly pick and copy over 1,000 images from "val207" + $ for jpg in $(ls -1 ${HOME}/data/coco/images/val2017/*.jpg | sort -R | head -1000); do \ + cp ${HOME}/data/coco/images/val2017/${jpg} calib_images/; \ + done + ``` + + When this is done, the 1,000 images for calibration should be present in the "${HOME}/project/tensorrt_demos/yolo/calib_images/" directory. + +2. Build the INT8 TensorRT engine. I use the "yolov3-608" model in the example commands below. (I've also created a "build_int8_engines.sh" script to facilitate building multiple INT8 engines at once.) 
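+
+   For reference, during an INT8 build TensorRT pulls the calibration images through a "calibrator" object to collect activation statistics. A minimal, hypothetical calibrator sketch is shown below (the repo's actual calibrator applies the same preprocessing as the yolo demos and differs in details):
+
+   ```python
+   # Minimal INT8 entropy calibrator sketch (single-image batches).
+   import os, glob
+   import cv2
+   import numpy as np
+   import tensorrt as trt
+   import pycuda.driver as cuda
+   import pycuda.autoinit  # noqa: creates the CUDA context
+
+   class CalibImageBatcher(trt.IInt8EntropyCalibrator2):
+       def __init__(self, img_dir, net_hw=(608, 608), cache_file='calib.bin'):
+           super().__init__()
+           self.jpgs = sorted(glob.glob(os.path.join(img_dir, '*.jpg')))
+           self.net_hw, self.cache_file, self.idx = net_hw, cache_file, 0
+           self.d_input = cuda.mem_alloc(3 * net_hw[0] * net_hw[1] * 4)  # fp32
+
+       def get_batch_size(self):
+           return 1
+
+       def get_batch(self, names):
+           if self.idx >= len(self.jpgs):
+               return None                    # no more data -> calibration ends
+           img = cv2.imread(self.jpgs[self.idx])
+           self.idx += 1
+           img = cv2.resize(img, (self.net_hw[1], self.net_hw[0]))
+           img = (img.astype(np.float32) / 255.0).transpose(2, 0, 1)[None]
+           cuda.memcpy_htod(self.d_input, np.ascontiguousarray(img))
+           return [int(self.d_input)]
+
+       def read_calibration_cache(self):
+           if os.path.isfile(self.cache_file):
+               with open(self.cache_file, 'rb') as f:
+                   return f.read()
+
+       def write_calibration_cache(self, cache):
+           with open(self.cache_file, 'wb') as f:
+               f.write(cache)
+
+   # The calibrator is then attached to the builder config, roughly:
+   #   config.set_flag(trt.BuilderFlag.INT8)
+   #   config.int8_calibrator = CalibImageBatcher('calib_images')
+   ```
+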
Note that building the INT8 TensorRT engine on Jetson Xavier NX takes quite long. By enabling verbose logging ("-v"), you would be able to monitor the progress more closely. + + ``` + $ ln -s yolov3-608.cfg yolov3-int8-608.cfg + $ ln -s yolov3-608.onnx yolov3-int8-608.onnx + $ python3 onnx_to_tensorrt.py -v --int8 -m yolov3-int8-608 + ``` + +3. (Optional) Build the TensorRT engines for the DLA cores. I use the "yolov3-608" model as example again. (I've also created a "build_dla_engines.sh" script for building multiple DLA engines at once.) + + ``` + $ ln -s yolov3-608.cfg yolov3-dla0-608.cfg + $ ln -s yolov3-608.onnx yolov3-dla0-608.onnx + $ python3 onnx_to_tensorrt.py -v --int8 --dla_core 0 -m yolov3-dla0-608 + $ ln -s yolov3-608.cfg yolov3-dla1-608.cfg + $ ln -s yolov3-608.onnx yolov3-dla1-608.onnx + $ python3 onnx_to_tensorrt.py -v --int8 --dla_core 1 -m yolov3-int8-608 + ``` + +4. Test the INT8 TensorRT engine with the "dog.jpg" image. + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_yolo.py --image ${HOME}/Pictures/dog.jpg \ + -m yolov3-int8-608 + ``` + + (Optional) Also test the DLA0 and DLA1 TensorRT engines. + + ```shell + $ python3 trt_yolo.py --image ${HOME}/Pictures/dog.jpg \ + -m yolov3-dla0-608 + $ python3 trt_yolo.py --image ${HOME}/Pictures/dog.jpg \ + -m yolov3-dla1-608 + ``` + +5. Evaluate mAP of the INT8 and DLA TensorRT engines. + + ```shell + $ python3 eval_yolo.py -m yolov3-int8-608 + $ python3 eval_yolo.py -m yolov3-dla0-608 + $ python3 eval_yolo.py -m yolov3-dla1-608 + ``` + +6. I tested the 5 original yolov3/yolov4 models on my Jetson Xavier NX DevKit with JetPack-4.4 (TensorRT 7.1.3.4). Here are the results. + + The following **FPS numbers** were measured under "15W 6CORE" mode, with CPU/GPU clocks set to maximum value (`sudo jetson_clocks`). + + | TensorRT engine | FP16 | INT8 | DLA0 | DLA1 | + |:-----------------|:--------:|:--------:|:--------:|:--------:| + | yolov3-tiny-416 | 58 | 65 | 42 | 42 | + | yolov3-608 | 15.2 | 23.1 | 14.9 | 14.9 | + | yolov3-spp-608 | 15.0 | 22.7 | 14.7 | 14.7 | + | yolov4-tiny-416 | 57 | 60 | X | X | + | yolov4-608 | 13.8 | 20.5 | 8.97 | 8.97 | + | yolov4-csp-512 | 19.8 | 27.8 | -- | -- | + | yolov4x-mish-640 | 9.01 | 14.1 | -- | -- | + + And the following are **"mAP@IoU=0.5:0.95" / "mAP@IoU=0.5"** of those TensorRT engines. + + | TensorRT engine | FP16 | INT8 | DLA0 | DLA1 | + |:-----------------|:---------------:|:---------------:|:---------------:|:---------------:| + | yolov3-tiny-416 | 0.096 / 0.202 | 0.094 / 0.198 | 0.096 / 0.199 | 0.096 / 0.199 | + | yolov3-608 | 0.376 / 0.665 | 0.378 / 0.670 | 0.378 / 0.670 | 0.378 / 0.670 | + | yolov3-spp-608 | 0.410 / 0.685 | 0.407 / 0.681 | 0.404 / 0.676 | 0.404 / 0.676 | + | yolov4-tiny-416 | 0.196 / 0.387 | 0.190 / 0.376 | X | X | + | yolov4-608 | 0.488 / 0.736 | *0.317 / 0.507* | 0.474 / 0.727 | 0.473 / 0.726 | + | yolov4-csp-512 | 0.436 / 0.630 | 0.391 / 0.577 | -- | -- | + | yolov4x-mish-640 | 0.470 / 0.668 | 0.434 / 0.631 | -- | -- | + +7. Issues: + + * For some reason, I'm not able to build DLA TensorRT engines for the "yolov4-tiny-416" model. I have [reported the issue](https://forums.developer.nvidia.com/t/problem-building-tensorrt-engines-for-dla-core/155749) to NVIDIA. + * There is no method in TensorRT 7.1 Python API to specifically set DLA core at inference time. I also [reported this issue](https://forums.developer.nvidia.com/t/no-method-in-tensorrt-python-api-for-setting-dla-core-for-inference/155874) to NVIDIA. 
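+     (For context, the DLA core is chosen at engine build time through the builder config; a hypothetical TensorRT 7 Python snippet would look roughly like `config.default_device_type = trt.DeviceType.DLA`, `config.DLA_core = 0` and `config.set_flag(trt.BuilderFlag.GPU_FALLBACK)`, which is presumably what the `--dla_core` option of "onnx_to_tensorrt.py" maps to.)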
When testing, I simply deserialize the TensorRT engines onto Jetson Xavier NX. I'm not 100% sure whether the engine is really executed on DLA core 0 or DLA core 1. + * mAP of the INT8 TensorRT engine of the "yolov4-608" model is not good. Originally, I thought it was [an issue of TensorRT library's handling of "Concat" nodes](https://forums.developer.nvidia.com/t/concat-in-caffe-parser-is-wrong-when-working-with-int8-calibration/142639/3?u=jkjung13). But after some more investigation, I saw that was not the case. Currently, I'm still not sure what the problem is... + + +Demo #7: MODNet +--------------- + +This demo illustrates the use of TensorRT to optimize an image segmentation model. More specifically, I build and test a TensorRT engine from the pre-trained MODNet to do real-time image/video "matting". The PyTorch MODNet model comes from [ZHKKKe/MODNet](https://github.com/ZHKKKe/MODNet). Note that, as stated by the original auther, this pre-trained model is under [Creative Commons Attribution NonCommercial ShareAlike 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) license. Thanks to [ZHKKKe](https://github.com/ZHKKKe) for sharing the model and inference code. + +This MODNet model contains [InstanceNorm2d](https://pytorch.org/docs/stable/generated/torch.nn.InstanceNorm2d.html) layers, which are only supported in recent versions of TensorRT. So far I have only tested the code with TensorRT 7.1 and 7.2. I don't guarantee the code would work for older versions of TensorRT. + +To make the demo simpler to follow, I have already converted the PyTorch MODNet model to ONNX ("modnet/modnet.onnx"). If you'd like to do the PyTorch-to-ONNX conversion by yourself, you could refer to [modnet/README.md](https://github.com/jkjung-avt/tensorrt_demos/blob/master/modnet/README.md). + +Here is the step-by-step guide for the demo: + +1. Install "pycuda" in case you haven't done so before. + + ```shell + $ cd ${HOME}/project/tensorrt_demos/modnet + $ ./install_pycuda.sh + ``` + +2. Build TensorRT engine from "modnet/modnet.onnx". + + This step would be easy if you are using **TensorRT 7.2 or later**. Just use the "modnet/onnx_to_tensorrt.py" script: (You could optionally use "-v" command-line option to see verbose logs.) + + ```shell + $ python3 onnx_to_tensorrt.py modnet.onnx modnet.engine + ``` + + When "onnx_to_tensorrt.py" finishes, the "modnet.engine" file should be generated. And you could go to step #3. + + In case you are using **TensorRT 7.1** (JetPack-4.5 or JetPack-4.4), "modnet/onnx_to_tensorrt.py" wouldn't work due to this error (which has been fixed in TensorRT 7.2): [UNSUPPORTED_NODE: Assertion failed: !isDynamic(tensorPtr->getDimensions()) && "InstanceNormalization does not support dynamic inputs!"](https://github.com/onnx/onnx-tensorrt/issues/374). I worked around the problem by building [onnx-tensorrt](https://github.com/onnx/onnx-tensorrt) by myself. Here's how you could do it too. + + ``` + $ cd ${HOME}/project/tensorrt_demos/modnet + ### check out the "onnx-tensorrt" submodule + $ git submodule update --init --recursive + ### patch CMakeLists.txt + $ sed -i '21s/cmake_minimum_required(VERSION 3.13)/#cmake_minimum_required(VERSION 3.13)/' \ + onnx-tensorrt/CMakeLists.txt + ### build onnx-tensorrt + $ mkdir -p onnx-tensorrt/build + $ cd onnx-tensorrt/build + $ cmake -DCMAKE_CXX_FLAGS=-I/usr/local/cuda/targets/aarch64-linux/include \ + -DONNX_NAMESPACE=onnx2trt_onnx .. 
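+   ### (note: the CMAKE_CXX_FLAGS entry above adds the aarch64 CUDA include
+   ###  path, and ONNX_NAMESPACE selects the protobuf namespace used for the
+   ###  bundled onnx sources that onnx-tensorrt compiles against)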
+ $ make -j4 + ### finally, we could build the TensorRT (FP16) engine + $ cd ${HOME}/project/tensorrt_demos/modnet + $ LD_LIBRARY_PATH=$(pwd)/onnx-tensorrt/build \ + onnx-tensorrt/build/onnx2trt modnet.onnx -o modnet.engine \ + -d 16 -v + ``` + +3. Test the TensorRT MODNet engine with "modnet/image.jpg". + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_modnet.py --image modnet/image.jpg + ``` + + You could see the matted image as below. Note that I get ~21 FPS when running the code on Jetson Xavier NX with JetPack-4.5. + + ![Matted modnet/image.jpg](https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/doc/image_trt_modnet.jpg) + +4. The "trt_modnet.py" demo program could also take various image inputs. Refer to step 5 in Demo #1 again. (For example, the "--usb" command-line option would be useful.) + +5. Instead of a boring black background, you could use the "--background" option to specify an alternative background. The background could be either a still image or a video file. Furthermore, you could also use the "--create_video" option to save the matted outputs as a video file. + + For example, I took a [Chou, Tzu-Yu video](https://youtu.be/L6B9BObaIRA) and a [beach video](https://youtu.be/LdsTydS4eww), and created a blended video like this: + + ```shell + $ cd ${HOME}/project/tensorrt_demos + $ python3 trt_modnet.py --video Tzu-Yu.mp4 \ + --background beach.mp4 \ + --demo_mode \ + --create_video output + ``` + + The result would be saved as "output.ts" on Jetson Xavier NX (or "output.mp4" on x86_64 PC). + + [![Video Matting Demo \| TensorRT MODNet](https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/doc/trt_modnet_youtube.jpg)](https://youtu.be/SIoJAI1bMyc) + +Licenses +-------- + +1. I referenced source code of [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) samples to develop most of the demos in this repository. Those NVIDIA samples are under [Apache License 2.0](https://github.com/NVIDIA/TensorRT/blob/master/LICENSE). +2. [GoogLeNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): "This model is released for unrestricted use." +3. [MTCNN](https://github.com/PKUZHOU/MTCNN_FaceDetection_TensorRT): license not specified. Note [the original MTCNN](https://github.com/kpzhang93/MTCNN_face_detection_alignment) is under [MIT License](https://github.com/kpzhang93/MTCNN_face_detection_alignment/blob/master/LICENSE). +4. [TensorFlow Object Detection Models](https://github.com/tensorflow/models/tree/master/research/object_detection): [Apache License 2.0](https://github.com/tensorflow/models/blob/master/LICENSE). +5. YOLOv3/YOLOv4 models ([DarkNet](https://github.com/AlexeyAB/darknet)): [YOLO LICENSE](https://github.com/AlexeyAB/darknet/blob/master/LICENSE). +6. [MODNet](https://github.com/ZHKKKe/MODNet): [Creative Commons Attribution NonCommercial ShareAlike 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) license. +7. For the rest of the code (developed by jkjung-avt and other contributors): [MIT License](https://github.com/jkjung-avt/tensorrt_demos/blob/master/LICENSE). diff --git a/README_mAP.md b/README_mAP.md new file mode 100644 index 0000000..16a56b2 --- /dev/null +++ b/README_mAP.md @@ -0,0 +1,122 @@ +# Instructions for evaluating accuracy (mAP) of SSD models + +Preparation +----------- + +1. Prepare image data and label ('bbox') file for the evaluation. 
I used COCO [2017 Val images (5K/1GB)](http://images.cocodataset.org/zips/val2017.zip) and [2017 Train/Val annotations (241MB)](http://images.cocodataset.org/annotations/annotations_trainval2017.zip). You could try to use your own dataset for evaluation, but you'd need to convert the labels into [COCO Object Detection ('bbox') format](http://cocodataset.org/#format-data) if you want to use code in this repository without modifications. + + More specifically, I downloaded the images and labels, and unzipped files into `${HOME}/data/coco/`. + + ```shell + $ wget http://images.cocodataset.org/zips/val2017.zip \ + -O ${HOME}/Downloads/val2017.zip + $ wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip \ + -O ${HOME}/Downloads/annotations_trainval2017.zip + $ mkdir -p ${HOME}/data/coco/images + $ cd ${HOME}/data/coco/images + $ unzip ${HOME}/Downloads/val2017.zip + $ cd ${HOME}/data/coco + $ unzip ${HOME}/Downloads/annotations_trainval2017.zip + ``` + + Later on I would be using the following (unzipped) image and annotation files for the evaluation. + + ``` + ${HOME}/data/coco/images/val2017/*.jpg + ${HOME}/data/coco/annotations/instances_val2017.json + ``` + +2. Install 'pycocotools'. The easiest way is to use `pip3 install`. + + ```shell + $ sudo pip3 install pycocotools + ``` + + Alternatively, you could build and install it from [source](https://github.com/cocodataset/cocoapi). + +3. Install additional requirements. + + ```shell + $ sudo pip3 install progressbar2 + ``` + +Evaluation +---------- + +I've created the [eval_ssd.py](eval_ssd.py) script to do the [mAP evaluation](http://cocodataset.org/#detection-eval). + +``` +usage: eval_ssd.py [-h] [--mode {tf,trt}] [--imgs_dir IMGS_DIR] + [--annotations ANNOTATIONS] + {ssd_mobilenet_v1_coco,ssd_mobilenet_v2_coco} +``` + +The script takes 1 mandatory argument: either 'ssd_mobilenet_v1_coco' or 'ssd_mobilenet_v2_coco'. In addition, it accepts the following options: + +* `--mode {tf,trt}`: to evaluate either the unoptimized TensorFlow frozen inference graph (tf) or the optimized TensorRT engine (trt). +* `--imgs_dir IMGS_DIR`: to specify an alternative directory for reading image files. +* `--annotations ANNOTATIONS`: to specify an alternative annotation/label file. + +For example, I evaluated both 'ssd_mobilenet_v1_coco' and 'ssd_mobilenet_v2_coco' TensorRT engines on my x86_64 PC and got these results. The overall mAP values are `0.230` and `0.246`, respectively. + +```shell +$ python3 eval_ssd.py --mode trt ssd_mobilenet_v1_coco +...... +100% (5000 of 5000) |####################| Elapsed Time: 0:00:26 Time: 0:00:26 +loading annotations into memory... +Done (t=0.36s) +creating index... +index created! +Loading and preparing results... +DONE (t=0.11s) +creating index... +index created! +Running per image evaluation... +Evaluate annotation type *bbox* +DONE (t=8.89s). +Accumulating evaluation results... +DONE (t=1.37s). 
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.232 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.351 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.254 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.018 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.166 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.530 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.209 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.264 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.264 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.022 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.191 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.606 +None +$ +$ python3 eval_ssd.py --mode trt ssd_mobilenet_v2_coco +...... +100% (5000 of 5000) |####################| Elapsed Time: 0:00:29 Time: 0:00:29 +loading annotations into memory... +Done (t=0.37s) +creating index... +index created! +Loading and preparing results... +DONE (t=0.12s) +creating index... +index created! +Running per image evaluation... +Evaluate annotation type *bbox* +DONE (t=9.47s). +Accumulating evaluation results... +DONE (t=1.42s). + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.248 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.375 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.273 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.021 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.573 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.221 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.278 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.279 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.027 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.202 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.643 +None +``` diff --git a/README_x86.md b/README_x86.md new file mode 100644 index 0000000..06f15d9 --- /dev/null +++ b/README_x86.md @@ -0,0 +1,123 @@ +# Instructions for x86_64 platforms + +All demos in this repository, with minor tweaks, should also work on x86_64 platforms with NVIDIA GPU(s). Here is a list of required modifications if you'd like to run the demos on an x86_64 PC/server. + + +Make sure you have TensorRT installed properly on your x86_64 system. You could follow NVIDIA's official [Installation Guide :: NVIDIA Deep Learning TensorRT](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) documentation. + +Demo #1 (GoogLeNet) and #2 (MTCNN) +---------------------------------- + +1. Set `TENSORRT_INCS` and `TENSORRT_LIBS` in "common/Makefile.config" correctly for your x86_64 system. More specifically, you should find the following lines in "common/Mafefile.config" and modify them if needed. + + ``` + # These are the directories where I installed TensorRT on my x86_64 PC. + TENSORRT_INCS=-I"/usr/local/TensorRT-7.1.3.4/include" + TENSORRT_LIBS=-L"/usr/local/TensorRT-7.1.3.4/lib" + ``` + +2. Set `library_dirs` and `include_dirs` in "setup.py". 
More specifically, you should check and make sure the 2 TensorRT path lines are correct. + + ```python + library_dirs = [ + '/usr/local/cuda/lib64', + '/usr/local/TensorRT-7.1.3.4/lib', # for my x86_64 PC + '/usr/local/lib', + ] + ...... + include_dirs = [ + # in case the following numpy include path does not work, you + # could replace it manually with, say, + # '-I/usr/local/lib/python3.6/dist-packages/numpy/core/include', + '-I' + numpy.__path__[0] + '/core/include', + '-I/usr/local/cuda/include', + '-I/usr/local/TensorRT-7.1.3.4/include', # for my x86_64 PC + '-I/usr/local/include', + ] + ``` + +3. Follow the steps in the original [README.md](https://github.com/jkjung-avt/tensorrt_demos/blob/master/README.md), and the demos should work on x86_64 as well. + +Demo #3 (SSD) +------------- + +1. Make sure to follow NVIDIA's official [Installation Guide :: NVIDIA Deep Learning TensorRT](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) documentation and pip3 install "tensorrt", "uff", and "graphsurgeon" packages. + +2. Patch `/usr/local/lib/python3.?/dist-packages/graphsurgeon/node_manipulation.py` by adding the following line (around line #42): + + ```python + def shape(node): + ...... + node.name = name or node.name + node.op = op or node.op or node.name + + node.attr["dtype"].type = 1 + for key, val in kwargs.items(): + ...... + ``` +3. (I think this step is only required for TensorRT 6 or earlier versions.) Re-build `libflattenconcat.so` from TensorRT's 'python/uff_ssd' sample source code. For example, + + ```shell + $ mkdir -p ${HOME}/src/TensorRT-5.1.5.0 + $ cp -r /usr/local/TensorRT-5.1.5.0/samples ${HOME}/src/TensorRT-5.1.5.0 + $ cd ${HOME}/src/TensorRT-5.1.5.0/samples/python/uff_ssd + $ mkdir build + $ cd build + $ cmake -D NVINFER_LIB=/usr/local/TensorRT-5.1.5.0/lib/libnvinfer.so \ + -D TRT_INCLUDE=/usr/local/TensorRT-5.1.5.0/include .. + $ make + $ cp libflattenconcat.so ${HOME}/project/tensorrt_demos/ssd/ + ``` + +4. Install "pycuda". + + ```shell + $ sudo apt-get install -y build-essential python-dev + $ sudo apt-get install -y libboost-python-dev libboost-thread-dev + $ sudo pip3 install setuptools + $ export boost_pylib=$(basename /usr/lib/x86_64-linux-gnu/libboost_python3-py3?.so) + $ export boost_pylibname=${boost_pylib%.so} + $ export boost_pyname=${boost_pylibname/lib/} + $ cd ${HOME}/src + $ wget https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz + $ tar xzvf pycuda-2019.1.2.tar.gz + $ cd pycuda-2019.1.2 + $ ./configure.py --python-exe=/usr/bin/python3 \ + --cuda-root=/usr/local/cuda \ + --cudadrv-lib-dir=/usr/lib/x86_64-linux-gnu \ + --boost-inc-dir=/usr/include \ + --boost-lib-dir=/usr/lib/x86_64-linux-gnu \ + --boost-python-libname=${boost_pyname} \ + --boost-thread-libname=boost_thread \ + --no-use-shipped-boost + $ make -j4 + $ python3 setup.py build + $ sudo python3 setup.py install + $ python3 -c "import pycuda; print('pycuda version:', pycuda.VERSION)" + ``` + +5. Follow the steps in the original [README.md](https://github.com/jkjung-avt/tensorrt_demos/blob/master/README.md) but skip `install.sh`. You should be able to build the SSD TensorRT engines and run them on on x86_64 as well. + +Demo #4 (YOLOv3) & Demo #5 (YOLOv4) +----------------------------------- + +Checkout "plugins/Makefile". 
You'll need to make sure in "plugins/Makefile": + +* CUDA `compute` is set correctly for your GPU (reference: [CUDA GPUs | NVIDIA Developer]()); +* `TENSORRT_INCS` and `TENSORRT_LIBS` point to the right paths. + +``` +...... +else ifeq ($(cpu_arch), x86_64) # x86_64 PC + $(warning "compute=75" is for GeForce RTX-2080 Ti. Please make sure CUDA compute is set correctly for your system in the Makefile.) + compute=75 +...... +NVCCFLAGS=-m64 -gencode arch=compute_$(compute),code=sm_$(compute) \ + -gencode arch=compute_$(compute),code=compute_$(compute) +...... +# These are the directories where I installed TensorRT on my x86_64 PC. +TENSORRT_INCS=-I"/usr/local/TensorRT-7.1.3.4/include" +TENSORRT_LIBS=-L"/usr/local/TensorRT-7.1.3.4/lib" +...... +``` + +Otherwise, you should be able to follow the steps in the original [README.md](https://github.com/jkjung-avt/tensorrt_demos/blob/master/README.md) to get these 2 demos working. diff --git a/common/Makefile.config b/common/Makefile.config new file mode 100644 index 0000000..fbf1f65 --- /dev/null +++ b/common/Makefile.config @@ -0,0 +1,207 @@ +.SUFFIXES: +TARGET?=$(shell uname -m) +ifeq ($(CUDA_INSTALL_DIR),) +$(warning CUDA_INSTALL_DIR variable is not specified, using /usr/local/cuda by default, use CUDA_INSTALL_DIR= to change.) +endif +ifeq ($(CUDNN_INSTALL_DIR),) +$(warning CUDNN_INSTALL_DIR variable is not specified, using $(CUDA_INSTALL_DIR) by default, use CUDNN_INSTALL_DIR= to change.) +endif +CUDA_INSTALL_DIR?=/usr/local/cuda +CUDNN_INSTALL_DIR?=$(CUDA_INSTALL_DIR) +CUDA_LIBDIR=lib +CUDNN_LIBDIR=lib64 +ifeq ($(TARGET), aarch64) +ifeq ($(shell uname -m), aarch64) +CUDA_LIBDIR=lib64 +CC = g++ +else +CC = aarch64-linux-gnu-g++ +endif +CUCC =$(CUDA_INSTALL_DIR)/bin/nvcc -m64 -ccbin $(CC) +else ifeq ($(TARGET), x86_64) +CUDA_LIBDIR=lib64 +CC = g++ +CUCC =$(CUDA_INSTALL_DIR)/bin/nvcc -m64 +else ifeq ($(TARGET), qnx) +CC = ${QNX_HOST}/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ +CUCC = $(CUDA_INSTALL_DIR)/bin/nvcc -m64 -ccbin $(CC) +else ifeq ($(TARGET), android64) +ifeq ($(NDK_ROOT),) +$(error NDK_ROOT must be set to build for android platforms) +endif +ifeq ($(ANDROID_CC),) +$(error ANDROID_CC must be set to the g++ compiler to build for android 64bit, for example $(NDK_ROOT)/toolschains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++) +endif +ifeq ($(NDK_SYS_ROOT),) +$(error NDK_SYS_ROOT must be set to compiler for android 64bit, for example $(NDK_ROOT)/platforms/android-24/arch-arm64) +endif +CUDA_LIBDIR=lib64 +ANDROID_FLAGS=--sysroot=${NDK_SYS_ROOT} -DANDROID -D_GLIBCXX_USE_C99=1 -Wno-sign-compare -D__aarch64__ -Wno-strict-aliasing -Werror -pie -fPIE +COMMON_FLAGS+=$(ANDROID_FLAGS) +COMMON_LD_FLAGS+=$(ANDROID_FLAGS) +CC=$(ANDROID_CC) +CUCC = $(CUDA_INSTALL_DIR)/bin/nvcc -m64 -ccbin $(CC) --compiler-options="--sysroot=${NDK_SYS_ROOT} -DANDROID -D_GLIBCXX_USE_C99=1 -Wno-sign-compare" +TGT_INCLUDES=-I$(NDK_ROOT)/platforms/android-24/arch-aarch64/usr/include -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/include -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/arm64-v8a/include +TGT_LIBS=-L$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/arm64-v8a +ANDROID=1 +else ######## +$(error Auto-detection of platform failed. Please specify one of the following arguments to make: TARGET=[aarch64|x86_64|qnx]) +endif + +ifdef VERBOSE +AT= +else +AT=@ +endif + +AR = ar cr +ECHO = @echo + +SHELL=/bin/sh + +#ROOT_PATH=../.. +#OUT_PATH=$(ROOT_PATH)/bin +OUT_PATH=. 
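+# Build outputs (the chobj/ and dchobj/ object directories and the linked
+# binaries) are placed under OUT_PATH, i.e. the sample's own directory.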
+OUTDIR=$(OUT_PATH) + +define concat +$1$2$3$4$5$6$7$8 +endef + +#$(call make-depend,source-file,object-file,depend-file) +define make-depend + $(AT)$(CC) -MM -MF $3 -MP -MT $2 $(COMMON_FLAGS) $1 +endef + +######################### + +# These are the directories where I installed TensorRT on my x86_64 PC. +TENSORRT_INCS=-I"/usr/local/TensorRT-7.1.3.4/include" +TENSORRT_LIBS=-L"/usr/local/TensorRT-7.1.3.4/lib" + +INCPATHS=-I"$(CUDA_INSTALL_DIR)/include" $(TENSORRT_INCS) -I"/usr/local/include" -I"$(CUDNN_INSTALL_DIR)/include" $(TGT_INCLUDES) -I"../common" +LIBPATHS=-L"$(CUDA_INSTALL_DIR)/$(CUDA_LIBDIR)" $(TENSORRT_LIBS) -L"/usr/local/lib" -L"$(CUDA_INSTALL_DIR)/$(CUDA_LIBDIR)" -L"$(CUDNN_INSTALL_DIR)/$(CUDNN_LIBDIR)" $(TGT_LIBS) + +.SUFFIXES: +vpath %.h $(EXTRA_DIRECTORIES) +vpath %.cpp $(EXTRA_DIRECTORIES) + +COMMON_FLAGS += -Wall -std=c++11 $(INCPATHS) +ifneq ($(ANDROID),1) +COMMON_FLAGS += -D_REENTRANT +endif +COMMON_LD_FLAGS += $(LIBPATHS) -L$(OUTDIR) + +OBJDIR =$(call concat,$(OUTDIR),/chobj) +DOBJDIR =$(call concat,$(OUTDIR),/dchobj) + +ifeq ($(ANDROID),1) +COMMON_LIBS = -lcudnn -lcublas -lnvToolsExt -lcudart +else +COMMON_LIBS = -lcudnn -lcublas -lcudart_static -lnvToolsExt -lcudart +endif +ifneq ($(TARGET), qnx) +ifneq ($(ANDROID),1) +COMMON_LIBS += -lrt -ldl -lpthread +endif +endif +ifeq ($(ANDROID),1) +COMMON_LIBS += -lculibos -lgnustl_shared -llog +endif + +LIBS =-lnvinfer -lnvparsers -lnvinfer_plugin $(COMMON_LIBS) +DLIBS =-lnvinfer -lnvparsers -lnvinfer_plugin $(COMMON_LIBS) +OBJS =$(patsubst %.cpp, $(OBJDIR)/%.o, $(wildcard *.cpp $(addsuffix /*.cpp, $(EXTRA_DIRECTORIES)))) +DOBJS =$(patsubst %.cpp, $(DOBJDIR)/%.o, $(wildcard *.cpp $(addsuffix /*.cpp, $(EXTRA_DIRECTORIES)))) +CUOBJS =$(patsubst %.cu, $(OBJDIR)/%.o, $(wildcard *.cu $(addsuffix /*.cu, $(EXTRA_DIRECTORIES)))) +CUDOBJS =$(patsubst %.cu, $(DOBJDIR)/%.o, $(wildcard *.cu $(addsuffix /*.cu, $(EXTRA_DIRECTORIES)))) + +CFLAGS=$(COMMON_FLAGS) +CFLAGSD=$(COMMON_FLAGS) -g +LFLAGS=$(COMMON_LD_FLAGS) +LFLAGSD=$(COMMON_LD_FLAGS) + +#all: debug release + +release : $(OUTDIR)/$(OUTNAME_RELEASE) + +debug : $(OUTDIR)/$(OUTNAME_DEBUG) + +test: test_debug test_release + +test_debug: + $(AT)cd $(OUTDIR) && ./$(OUTNAME_DEBUG) + +test_release: + $(AT)cd $(OUTDIR) && ./$(OUTNAME_RELEASE) + +ifdef MAC +$(OUTDIR)/$(OUTNAME_RELEASE) : $(OBJS) $(CUOBJS) + $(ECHO) Linking: $@ + $(AT)$(CC) -o $@ $^ $(LFLAGS) $(LIBS) + $(foreach EXTRA_FILE,$(EXTRA_FILES), cp -f $(EXTRA_FILE) $(OUTDIR)/$(EXTRA_FILE); ) + +$(OUTDIR)/$(OUTNAME_DEBUG) : $(DOBJS) $(CUDOBJS) + $(ECHO) Linking: $@ + $(AT)$(CC) -o $@ $^ $(LFLAGSD) $(DLIBS) +else +$(OUTDIR)/$(OUTNAME_RELEASE) : $(OBJS) $(CUOBJS) + $(ECHO) Linking: $@ + $(AT)$(CC) -o $@ $^ $(LFLAGS) -Wl,--start-group $(LIBS) -Wl,--end-group + $(foreach EXTRA_FILE,$(EXTRA_FILES), cp -f $(EXTRA_FILE) $(OUTDIR)/$(EXTRA_FILE); ) + +$(OUTDIR)/$(OUTNAME_DEBUG) : $(DOBJS) $(CUDOBJS) + $(ECHO) Linking: $@ + $(AT)$(CC) -o $@ $^ $(LFLAGSD) -Wl,--start-group $(DLIBS) -Wl,--end-group +endif + +$(OBJDIR)/%.o: %.cpp + $(AT)if [ ! -d $(OBJDIR) ]; then mkdir -p $(OBJDIR); fi + $(foreach XDIR,$(EXTRA_DIRECTORIES), if [ ! -d $(OBJDIR)/$(XDIR) ]; then mkdir -p $(OBJDIR)/$(XDIR); fi;) : + $(call make-depend,$<,$@,$(subst .o,.d,$@)) + $(ECHO) Compiling: $< + $(AT)$(CC) $(CFLAGS) -c -o $@ $< + +$(DOBJDIR)/%.o: %.cpp + $(AT)if [ ! -d $(DOBJDIR) ]; then mkdir -p $(DOBJDIR); fi + $(foreach XDIR,$(EXTRA_DIRECTORIES), if [ ! 
-d $(OBJDIR)/$(XDIR) ]; then mkdir -p $(DOBJDIR)/$(XDIR); fi;) : + $(call make-depend,$<,$@,$(subst .o,.d,$@)) + $(ECHO) Compiling: $< + $(AT)$(CC) $(CFLAGSD) -c -o $@ $< + +######################################################################### CU +$(OBJDIR)/%.o: %.cu + $(AT)if [ ! -d $(OBJDIR) ]; then mkdir -p $(OBJDIR); fi + $(foreach XDIR,$(EXTRA_DIRECTORIES), if [ ! -d $(OBJDIR)/$(XDIR) ]; then mkdir -p $(OBJDIR)/$(XDIR); fi;) : + $(call make-depend,$<,$@,$(subst .o,.d,$@)) + $(ECHO) Compiling CUDA release: $< + $(AT)$(CUCC) $(CUFLAGS) -c -o $@ $< + +$(DOBJDIR)/%.o: %.cu + $(AT)if [ ! -d $(DOBJDIR) ]; then mkdir -p $(DOBJDIR); fi + $(foreach XDIR,$(EXTRA_DIRECTORIES), if [ ! -d $(DOBJDIR)/$(XDIR) ]; then mkdir -p $(DOBJDIR)/$(XDIR); fi;) : + $(call make-depend,$<,$@,$(subst .o,.d,$@)) + $(ECHO) Compiling CUDA debug: $< + $(AT)$(CUCC) $(CUFLAGSD) -c -o $@ $< + +clean: + $(ECHO) Cleaning... + $(AT)-rm -rf $(OBJDIR) $(DOBJDIR) $(OUTDIR)/$(OUTNAME_RELEASE) $(OUTDIR)/$(OUTNAME_DEBUG) + $(AT)-rm -rf *.engine + +ifneq "$(MAKECMDGOALS)" "clean" + -include $(OBJDIR)/*.d $(DOBJDIR)/*.d +endif + +ifeq ($(DO_CUDNN_CHECK), 1) +# To display newlines in the message +define _cudnn_missing_newline_5020fd0 + + +endef +SHELL=/bin/bash +CUDNN_CHECK = $(shell echo -e '\#include \nint main(){ cudnnCreate(nullptr); return 0; }' | $(CC) -xc++ -o /dev/null $(CFLAGS) $(LFLAGS) - $(COMMON_LIBS) 2> /dev/null && echo 'passed_cudnn_exists_check') +ifneq ($(CUDNN_CHECK), passed_cudnn_exists_check) +$(error $(_cudnn_missing_newline_5020fd0)$(_cudnn_missing_newline_5020fd0)This sample requires CUDNN, but it could not be found.$(_cudnn_missing_newline_5020fd0)Please install CUDNN from https://developer.nvidia.com/cudnn or specify CUDNN_INSTALL_DIR when compiling.$(_cudnn_missing_newline_5020fd0)For example, `make CUDNN_INSTALL_DIR=/path/to/CUDNN/` where /path/to/CUDNN/ contains include/ and lib/ subdirectories.$(_cudnn_missing_newline_5020fd0)$(_cudnn_missing_newline_5020fd0)) +endif +endif diff --git a/common/common.h b/common/common.h new file mode 100644 index 0000000..288d298 --- /dev/null +++ b/common/common.h @@ -0,0 +1,364 @@ +#ifndef _TRT_COMMON_H_ +#define _TRT_COMMON_H_ +#include "NvInfer.h" +//#include "NvOnnxConfig.h" +//#include "NvOnnxParser.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#if NV_TENSORRT_MAJOR >= 8 +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + +#define CHECK(status) \ + do \ + { \ + auto ret = (status); \ + if (ret != 0) \ + { \ + std::cout << "Cuda failure: " << ret; \ + abort(); \ + } \ + } while (0) + +constexpr long double operator"" _GB(long double val) { return val * (1 << 30); } +constexpr long double operator"" _MB(long double val) { return val * (1 << 20); } +constexpr long double operator"" _KB(long double val) { return val * (1 << 10); } + +// These is necessary if we want to be able to write 1_GB instead of 1.0_GB. +// Since the return type is signed, -1_GB will work as expected. 
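+// (so e.g. 1_GB == 1073741824 and 1.5_MB == 1572864.0)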
+constexpr long long int operator"" _GB(long long unsigned int val) { return val * (1 << 30); }
+constexpr long long int operator"" _MB(long long unsigned int val) { return val * (1 << 20); }
+constexpr long long int operator"" _KB(long long unsigned int val) { return val * (1 << 10); }
+
+// Logger for TensorRT info/warning/errors
+class Logger : public nvinfer1::ILogger
+{
+public:
+
+    //Logger(): Logger(Severity::kWARNING) {}
+
+    Logger(Severity severity): reportableSeverity(severity) {}
+
+    void log(Severity severity, const char* msg) NOEXCEPT override
+    {
+        // suppress messages with severity enum value greater than the reportable
+        if (severity > reportableSeverity) return;
+
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
+        case Severity::kERROR: std::cerr << "ERROR: "; break;
+        case Severity::kWARNING: std::cerr << "WARNING: "; break;
+        case Severity::kINFO: std::cerr << "INFO: "; break;
+        default: std::cerr << "UNKNOWN: "; break;
+        }
+        std::cerr << msg << std::endl;
+    }
+
+    Severity reportableSeverity{Severity::kWARNING};
+};
+
+// Locate path to file, given its filename or filepath suffix and possible dirs it might lie in
+// Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path
+inline std::string locateFile(const std::string& filepathSuffix, const std::vector<std::string>& directories)
+{
+    const int MAX_DEPTH{10};
+    bool found{false};
+    std::string filepath;
+
+    for (auto& dir : directories)
+    {
+        filepath = dir + filepathSuffix;
+
+        for (int i = 0; i < MAX_DEPTH && !found; i++)
+        {
+            std::ifstream checkFile(filepath);
+            found = checkFile.is_open();
+            if (found) break;
+            filepath = "../" + filepath; // Try again in parent dir
+        }
+
+        if (found)
+        {
+            break;
+        }
+
+        filepath.clear();
+    }
+
+    if (filepath.empty()) {
+        std::string directoryList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
+            [](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
+        throw std::runtime_error("Could not find " + filepathSuffix + " in data directories:\n\t" + directoryList);
+    }
+    return filepath;
+}
+
+inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW)
+{
+    std::ifstream infile(fileName, std::ifstream::binary);
+    assert(infile.is_open() && "Attempting to read from a file that is not open.");
+    std::string magic, h, w, max;
+    infile >> magic >> h >> w >> max;
+    infile.seekg(1, infile.cur);
+    infile.read(reinterpret_cast<char*>(buffer), inH * inW);
+}
+
+namespace samples_common
+{
+
+inline void* safeCudaMalloc(size_t memSize)
+{
+    void* deviceMem;
+    CHECK(cudaMalloc(&deviceMem, memSize));
+    if (deviceMem == nullptr)
+    {
+        std::cerr << "Out of memory" << std::endl;
+        exit(1);
+    }
+    return deviceMem;
+}
+
+inline bool isDebug()
+{
+    return (std::getenv("TENSORRT_DEBUG") ? true : false);
+}
+
+struct InferDeleter
+{
+    template <typename T>
+    void operator()(T* obj) const
+    {
+        if (obj) {
+            obj->destroy();
+        }
+    }
+};
+
+template <typename T>
+inline std::shared_ptr<T> infer_object(T* obj)
+{
+    if (!obj) {
+        throw std::runtime_error("Failed to create object");
+    }
+    return std::shared_ptr<T>(obj, InferDeleter());
+}
+
+template <typename Iter>
+inline std::vector<size_t> argsort(Iter begin, Iter end, bool reverse = false)
+{
+    std::vector<size_t> inds(end - begin);
+    std::iota(inds.begin(), inds.end(), 0);
+    if (reverse) {
+        std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
+            return begin[i2] < begin[i1];
+        });
+    }
+    else
+    {
+        std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
+            return begin[i1] < begin[i2];
+        });
+    }
+    return inds;
+}
+
+inline bool readReferenceFile(const std::string& fileName, std::vector<std::string>& refVector)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open()) {
+        cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << endl;
+        return false;
+    }
+    std::string line;
+    while (std::getline(infile, line)) {
+        if (line.empty()) continue;
+        refVector.push_back(line);
+    }
+    infile.close();
+    return true;
+}
+
+template <typename result_vector_t>
+inline std::vector<std::string> classify(const vector<std::string>& refVector, const result_vector_t& output, const size_t topK)
+{
+    auto inds = samples_common::argsort(output.cbegin(), output.cend(), true);
+    std::vector<std::string> result;
+    for (size_t k = 0; k < topK; ++k) {
+        result.push_back(refVector[inds[k]]);
+    }
+    return result;
+}
+
+//...LG returns top K indices, not values.
+template <typename T>
+inline vector<size_t> topK(const vector<T> inp, const size_t k)
+{
+    vector<size_t> result;
+    std::vector<size_t> inds = samples_common::argsort(inp.cbegin(), inp.cend(), true);
+    result.assign(inds.begin(), inds.begin()+k);
+    return result;
+}
+
+template <typename T>
+inline bool readASCIIFile(const string& fileName, const size_t size, vector<T>& out)
+{
+    std::ifstream infile(fileName);
+    if (!infile.is_open()) {
+        cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << endl;
+        return false;
+    }
+    out.clear();
+    out.reserve(size);
+    out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
+    infile.close();
+    return true;
+}
+
+template <typename T>
+inline bool writeASCIIFile(const string& fileName, const vector<T>& in)
+{
+    std::ofstream outfile(fileName);
+    if (!outfile.is_open()) {
+        cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << endl;
+        return false;
+    }
+    for (auto fn : in) {
+        outfile << fn << " ";
+    }
+    outfile.close();
+    return true;
+}
+
+#if 0  // for compatibility between TensorRT 3.x and 4.x
+inline void print_version()
+{
+//... This can be only done after statically linking this support into parserONNX.library
+    std::cout << "Parser built against:" << std::endl;
+    std::cout << "  ONNX IR version:  " << nvonnxparser::onnx_ir_version_string(onnx::IR_VERSION) << std::endl;
+    std::cout << "  TensorRT version: "
+              << NV_TENSORRT_MAJOR << "."
+              << NV_TENSORRT_MINOR << "."
+              << NV_TENSORRT_PATCH << "."
+              << NV_TENSORRT_BUILD << std::endl;
+}
+#endif // 0
+
+inline string getFileType(const string& filepath)
+{
+    return filepath.substr(filepath.find_last_of(".") + 1);
+}
+
+inline string toLower(const string& inp)
+{
+    string out = inp;
+    std::transform(out.begin(), out.end(), out.begin(), ::tolower);
+    return out;
+}
+
+#if 0  // for compatibility between TensorRT 3.x and 4.x
+inline unsigned int getElementSize(nvinfer1::DataType t)
+{
+    switch (t)
+    {
+    case nvinfer1::DataType::kINT32: return 4;
+    case nvinfer1::DataType::kFLOAT: return 4;
+    case nvinfer1::DataType::kHALF: return 2;
+    case nvinfer1::DataType::kINT8: return 1;
+    }
+    throw std::runtime_error("Invalid DataType.");
+    return 0;
+}
+#endif // 0
+
+inline int64_t volume(const nvinfer1::Dims& d)
+{
+    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
+}
+
+// Struct to maintain command-line arguments.
+struct Args
+{
+    bool runInInt8 = false;
+};
+
+// Populates the Args struct with the provided command-line parameters.
+inline void parseArgs(Args& args, int argc, char* argv[])
+{
+    if (argc >= 1)
+    {
+        for (int i = 1; i < argc; ++i)
+        {
+            if (!strcmp(argv[i], "--int8")) args.runInInt8 = true;
+        }
+    }
+}
+
+template <int C, int H, int W>
+struct PPM
+{
+    std::string magic, fileName;
+    int h, w, max;
+    uint8_t buffer[C * H * W];
+};
+
+struct BBox
+{
+    float x1, y1, x2, y2;
+};
+
+template <int C, int H, int W>
+inline void writePPMFileWithBBox(const std::string& filename, PPM<C, H, W>& ppm, const BBox& bbox)
+{
+    std::ofstream outfile("./" + filename, std::ofstream::binary);
+    assert(!outfile.fail());
+    outfile << "P6" << "\n" << ppm.w << " " << ppm.h << "\n" << ppm.max << "\n";
+    auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); };
+    const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1);
+    const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1);
+    const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1);
+    const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1);
+    for (int x = x1; x <= x2; ++x)
+    {
+        // bbox top border
+        ppm.buffer[(y1 * ppm.w + x) * 3] = 255;
+        ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0;
+        ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0;
+        // bbox bottom border
+        ppm.buffer[(y2 * ppm.w + x) * 3] = 255;
+        ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0;
+        ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0;
+    }
+    for (int y = y1; y <= y2; ++y)
+    {
+        // bbox left border
+        ppm.buffer[(y * ppm.w + x1) * 3] = 255;
+        ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0;
+        ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0;
+        // bbox right border
+        ppm.buffer[(y * ppm.w + x2) * 3] = 255;
+        ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0;
+        ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0;
+    }
+    outfile.write(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
+}
+
+} // namespace samples_common
+
+#endif // _TRT_COMMON_H_
diff --git a/eval_ssd.py b/eval_ssd.py
new file mode 100644
index 0000000..4eaf1d4
--- /dev/null
+++ b/eval_ssd.py
@@ -0,0 +1,104 @@
+"""eval_ssd.py
+
+This script is for evaluating mAP (accuracy) of SSD models.  The
+model being evaluated could be either a TensorFlow frozen inference
+graph (pb) or a TensorRT engine.
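+
+Example usage (assuming the "ssd_mobilenet_v1_coco" TensorRT engine has
+already been built per the SSD demo, and that the COCO val2017 images and
+annotations sit at the default paths defined below):
+
+    $ python3 eval_ssd.py --mode trt ssd_mobilenet_v1_coco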
+""" + + +import os +import sys +import json +import argparse + +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from progressbar import progressbar + +from utils.ssd import TrtSSD +from utils.ssd_tf import TfSSD + + +INPUT_HW = (300, 300) +SUPPORTED_MODELS = [ + 'ssd_mobilenet_v1_coco', + 'ssd_mobilenet_v2_coco', +] + +HOME = os.environ['HOME'] +VAL_IMGS_DIR = HOME + '/data/coco/images/val2017' +VAL_ANNOTATIONS = HOME + '/data/coco/annotations/instances_val2017.json' + + +def parse_args(): + """Parse input arguments.""" + desc = 'Evaluate mAP of SSD model' + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('--mode', type=str, default='trt', + choices=['tf', 'trt']) + parser.add_argument('--imgs_dir', type=str, default=VAL_IMGS_DIR, + help='directory of validation images [%s]' % VAL_IMGS_DIR) + parser.add_argument('--annotations', type=str, default=VAL_ANNOTATIONS, + help='groundtruth annotations [%s]' % VAL_ANNOTATIONS) + parser.add_argument('model', type=str, choices=SUPPORTED_MODELS) + args = parser.parse_args() + return args + + +def check_args(args): + """Check and make sure command-line arguments are valid.""" + if not os.path.isdir(args.imgs_dir): + sys.exit('%s is not a valid directory' % args.imgs_dir) + if not os.path.isfile(args.annotations): + sys.exit('%s is not a valid file' % args.annotations) + + +def generate_results(ssd, imgs_dir, jpgs, results_file): + """Run detection on each jpg and write results to file.""" + results = [] + for jpg in progressbar(jpgs): + img = cv2.imread(os.path.join(imgs_dir, jpg)) + image_id = int(jpg.split('.')[0].split('_')[-1]) + boxes, confs, clss = ssd.detect(img, conf_th=1e-2) + for box, conf, cls in zip(boxes, confs, clss): + x = float(box[0]) + y = float(box[1]) + w = float(box[2] - box[0] + 1) + h = float(box[3] - box[1] + 1) + results.append({'image_id': image_id, + 'category_id': int(cls), + 'bbox': [x, y, w, h], + 'score': float(conf)}) + with open(results_file, 'w') as f: + f.write(json.dumps(results, indent=4)) + + +def main(): + args = parse_args() + check_args(args) + + results_file = 'ssd/results_%s_%s.json' % (args.model, args.mode) + if args.mode == 'trt': + ssd = TrtSSD(args.model, INPUT_HW) + else: + ssd = TfSSD(args.model, INPUT_HW) + + jpgs = [j for j in os.listdir(args.imgs_dir) if j.endswith('.jpg')] + generate_results(ssd, args.imgs_dir, jpgs, results_file) + + # Run COCO mAP evaluation + # Reference: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb + cocoGt = COCO(args.annotations) + cocoDt = cocoGt.loadRes(results_file) + imgIds = sorted(cocoGt.getImgIds()) + cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') + cocoEval.params.imgIds = imgIds + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + +if __name__ == '__main__': + main() diff --git a/eval_yolo.py b/eval_yolo.py new file mode 100644 index 0000000..258c378 --- /dev/null +++ b/eval_yolo.py @@ -0,0 +1,116 @@ +"""eval_yolo.py + +This script is for evaluating mAP (accuracy) of YOLO models. 
+""" + + +import os +import sys +import json +import argparse + +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver + +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from progressbar import progressbar + +from utils.yolo_with_plugins import TrtYOLO +from utils.yolo_classes import yolo_cls_to_ssd + + + +HOME = os.environ['HOME'] +VAL_IMGS_DIR = HOME + '/data/coco/images/val2017' +VAL_ANNOTATIONS = HOME + '/data/coco/annotations/instances_val2017.json' + + +def parse_args(): + """Parse input arguments.""" + desc = 'Evaluate mAP of YOLO model' + parser = argparse.ArgumentParser(description=desc) + parser.add_argument( + '--imgs_dir', type=str, default=VAL_IMGS_DIR, + help='directory of validation images [%s]' % VAL_IMGS_DIR) + parser.add_argument( + '--annotations', type=str, default=VAL_ANNOTATIONS, + help='groundtruth annotations [%s]' % VAL_ANNOTATIONS) + parser.add_argument( + '--non_coco', action='store_true', + help='don\'t do coco class translation [False]') + parser.add_argument( + '-c', '--category_num', type=int, default=80, + help='number of object categories [80]') + parser.add_argument( + '-m', '--model', type=str, required=True, + help=('[yolov3|yolov3-tiny|yolov3-spp|yolov4|yolov4-tiny]-' + '[{dimension}], where dimension could be a single ' + 'number (e.g. 288, 416, 608) or WxH (e.g. 416x256)')) + parser.add_argument( + '-l', '--letter_box', action='store_true', + help='inference with letterboxed image [False]') + args = parser.parse_args() + return args + + +def check_args(args): + """Check and make sure command-line arguments are valid.""" + if not os.path.isdir(args.imgs_dir): + sys.exit('%s is not a valid directory' % args.imgs_dir) + if not os.path.isfile(args.annotations): + sys.exit('%s is not a valid file' % args.annotations) + + +def generate_results(trt_yolo, imgs_dir, jpgs, results_file, non_coco): + """Run detection on each jpg and write results to file.""" + results = [] + for jpg in progressbar(jpgs): + img = cv2.imread(os.path.join(imgs_dir, jpg)) + image_id = int(jpg.split('.')[0].split('_')[-1]) + boxes, confs, clss = trt_yolo.detect(img, conf_th=1e-2) + for box, conf, cls in zip(boxes, confs, clss): + x = float(box[0]) + y = float(box[1]) + w = float(box[2] - box[0] + 1) + h = float(box[3] - box[1] + 1) + cls = int(cls) + cls = cls if non_coco else yolo_cls_to_ssd[cls] + results.append({'image_id': image_id, + 'category_id': cls, + 'bbox': [x, y, w, h], + 'score': float(conf)}) + with open(results_file, 'w') as f: + f.write(json.dumps(results, indent=4)) + + +def main(): + args = parse_args() + check_args(args) + if args.category_num <= 0: + raise SystemExit('ERROR: bad category_num (%d)!' % args.category_num) + if not os.path.isfile('yolo/%s.trt' % args.model): + raise SystemExit('ERROR: file (yolo/%s.trt) not found!' 
% args.model) + + results_file = 'yolo/results_%s.json' % args.model + + trt_yolo = TrtYOLO(args.model, args.category_num, args.letter_box) + + jpgs = [j for j in os.listdir(args.imgs_dir) if j.endswith('.jpg')] + generate_results(trt_yolo, args.imgs_dir, jpgs, results_file, + non_coco=args.non_coco) + + # Run COCO mAP evaluation + # Reference: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb + cocoGt = COCO(args.annotations) + cocoDt = cocoGt.loadRes(results_file) + imgIds = sorted(cocoGt.getImgIds()) + cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') + cocoEval.params.imgIds = imgIds + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + +if __name__ == '__main__': + main() diff --git a/googlenet/Makefile b/googlenet/Makefile new file mode 100644 index 0000000..3ca9d45 --- /dev/null +++ b/googlenet/Makefile @@ -0,0 +1,6 @@ +OUTNAME_RELEASE = create_engine +OUTNAME_DEBUG = create_engine_debug +MAKEFILE_CONFIG ?= ../common/Makefile.config +include $(MAKEFILE_CONFIG) + +all: release diff --git a/googlenet/README.md b/googlenet/README.md new file mode 100644 index 0000000..8cc5365 --- /dev/null +++ b/googlenet/README.md @@ -0,0 +1 @@ +The caffe prototxt and model files in this directory were copied from [BVLC/caffe/models/bvlc_googlenet/](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet). diff --git a/googlenet/create_engine.cpp b/googlenet/create_engine.cpp new file mode 100644 index 0000000..0a7b0ef --- /dev/null +++ b/googlenet/create_engine.cpp @@ -0,0 +1,222 @@ +// create_engine.cpp +// +// This program creates TensorRT engine for the GoogLeNet model. +// +// Inputs: +// deploy.prototxt +// deploy.caffemodel +// +// Outputs: +// deploy.engine + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" +#include "common.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +//static Logger gLogger(ILogger::Severity::kINFO); +static Logger gLogger(ILogger::Severity::kWARNING); + +class IHostMemoryFromFile : public IHostMemory +{ + public: + IHostMemoryFromFile(std::string filename); +#if NV_TENSORRT_MAJOR >= 6 + void* data() const noexcept { return mem; } + std::size_t size() const noexcept { return s; } + DataType type () const noexcept { return DataType::kFLOAT; } // not used + void destroy() noexcept { free(mem); } +#else // NV_TENSORRT_MAJOR < 6 + void* data() const { return mem; } + std::size_t size() const { return s; } + DataType type () const { return DataType::kFLOAT; } // not used + void destroy() { free(mem); } +#endif // NV_TENSORRT_MAJOR + private: + void *mem{nullptr}; + std::size_t s; +}; + +IHostMemoryFromFile::IHostMemoryFromFile(std::string filename) +{ + std::ifstream infile(filename, std::ifstream::binary | std::ifstream::ate); + s = infile.tellg(); + infile.seekg(0, std::ios::beg); + mem = malloc(s); + infile.read(reinterpret_cast(mem), s); +} + +std::string locateFile(const std::string& input) +{ + std::vector dirs{"./"}; + return locateFile(input, dirs); +} + +void caffeToTRTModel(const std::string& deployFile, // name for caffe prototxt + const std::string& modelFile, // name for model + const std::vector& outputs, // network outputs + unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with) + IHostMemory *&trtModelStream) +{ + // create API root class - must span the lifetime of the engine usage + IBuilder* builder = createInferBuilder(gLogger); +#if 
NV_TENSORRT_MAJOR >= 7 + INetworkDefinition* network = builder->createNetworkV2(0); // no kEXPLICIT_BATCH +#else // NV_TENSORRT_MAJOR < 7 + INetworkDefinition* network = builder->createNetwork(); +#endif + + // parse the caffe model to populate the network, then set the outputs + ICaffeParser* parser = createCaffeParser(); + + bool useFp16 = builder->platformHasFastFp16(); + + // create a 16-bit model if it's natively supported + DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT; + const IBlobNameToTensor *blobNameToTensor = + parser->parse(locateFile(deployFile).c_str(), // caffe deploy file + locateFile(modelFile).c_str(), // caffe model file + *network, // network definition that the parser will populate + modelDataType); + assert(blobNameToTensor != nullptr); + + // the caffe file has no notion of outputs, so we need to manually say which tensors the engine should generate + for (auto& s : outputs) + network->markOutput(*blobNameToTensor->find(s.c_str())); + +#if NV_TENSORRT_MAJOR >= 7 + auto config = builder->createBuilderConfig(); + assert(config != nullptr); + + builder->setMaxBatchSize(maxBatchSize); + config->setMaxWorkspaceSize(64_MB); + if (useFp16) { + config->setFlag(BuilderFlag::kFP16); + cout << "Building TensorRT engine in FP16 mode..." << endl; + } else { + cout << "Building TensorRT engine in FP32 mode..." << endl; + } + ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + config->destroy(); +#else // NV_TENSORRT_MAJOR < 7 + // Build the engine + builder->setMaxBatchSize(maxBatchSize); + builder->setMaxWorkspaceSize(64_MB); + + // set up the network for paired-fp16 format if available + if (useFp16) { +#if NV_TENSORRT_MAJOR >= 4 + builder->setFp16Mode(true); +#else // NV_TENSORRT_MAJOR < 4 + builder->setHalf2Mode(true); +#endif + } + ICudaEngine* engine = builder->buildCudaEngine(*network); +#endif // NV_TENSORRT_MAJOR >= 7 + assert(engine != nullptr); + + // we don't need the network any more, and we can destroy the parser + parser->destroy(); + network->destroy(); + + // serialize the engine, then close everything down + trtModelStream = engine->serialize(); + engine->destroy(); + builder->destroy(); +} + +void giestream_to_file(IHostMemory *trtModelStream, const std::string filename) +{ + assert(trtModelStream != nullptr); + std::ofstream outfile(filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile.write(reinterpret_cast(trtModelStream->data()), trtModelStream->size()); + outfile.close(); +} + +void file_to_giestream(const std::string filename, IHostMemoryFromFile *&trtModelStream) +{ + trtModelStream = new IHostMemoryFromFile(filename); +} + +void verify_engine(std::string det_name) +{ + std::stringstream ss; + ss << det_name << ".engine"; + IHostMemoryFromFile *trtModelStream{nullptr}; + file_to_giestream(ss.str(), trtModelStream); + + // create an engine + IRuntime* infer = createInferRuntime(gLogger); + assert(infer != nullptr); + ICudaEngine* engine = infer->deserializeCudaEngine( + trtModelStream->data(), + trtModelStream->size(), + nullptr); + assert(engine != nullptr); + + assert(engine->getNbBindings() == 2); + std::cout << "Bindings for " << det_name << " after deserializing:" + << std::endl; + for (int bi = 0; bi < 2; bi++) { +#if NV_TENSORRT_MAJOR >= 4 + Dims3 dim = static_cast(engine->getBindingDimensions(bi)); + if (engine->bindingIsInput(bi) == true) { + std::cout << " Input "; + } else { + std::cout << " Output "; + } + std::cout << bi << ": " << engine->getBindingName(bi) << ", " + << dim.d[0] 
<< "x" << dim.d[1] << "x" << dim.d[2] + << std::endl; +#else // NV_TENSORRT_MAJOR < 4 + DimsCHW dim = static_cast(engine->getBindingDimensions(bi)); + if (engine->bindingIsInput(bi) == true) { + std::cout << " Input "; + } else { + std::cout << " Output "; + } + std::cout << bi << ": " << engine->getBindingName(bi) << ", " + << dim.c() << "x" << dim.h() << "x" << dim.w() + << std::endl; +#endif // NV_TENSORRT_MAJOR + } + engine->destroy(); + infer->destroy(); + trtModelStream->destroy(); +} + +int main(int argc, char** argv) +{ + IHostMemory *trtModelStream{nullptr}; + + std::cout << "Building deploy.engine, maxBatchSize = 1" << std::endl; + caffeToTRTModel("deploy.prototxt", + "deploy.caffemodel", + std::vector { "prob" }, + 1, // batch size + trtModelStream); + giestream_to_file(trtModelStream, "deploy.engine"); + trtModelStream->destroy(); + //delete trtModelStream; + + shutdownProtobufLibrary(); + + std::cout << std::endl << "Verifying engine..." << std::endl; + verify_engine("deploy"); + std::cout << "Done." << std::endl; + return 0; +} diff --git a/googlenet/deploy.caffemodel b/googlenet/deploy.caffemodel new file mode 100644 index 0000000..a21694f Binary files /dev/null and b/googlenet/deploy.caffemodel differ diff --git a/googlenet/deploy.prototxt b/googlenet/deploy.prototxt new file mode 100644 index 0000000..0c296a7 --- /dev/null +++ b/googlenet/deploy.prototxt @@ -0,0 +1,2157 @@ +name: "GoogleNet" +layer { + name: "data" + type: "Input" + top: "data" + input_param { shape: { dim: 1 dim: 3 dim: 224 dim: 224 } } +} +layer { + name: "conv1/7x7_s2" + type: "Convolution" + bottom: "data" + top: "conv1/7x7_s2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "conv1/relu_7x7" + type: "ReLU" + bottom: "conv1/7x7_s2" + top: "conv1/7x7_s2" +} +layer { + name: "pool1/3x3_s2" + type: "Pooling" + bottom: "conv1/7x7_s2" + top: "pool1/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "pool1/norm1" + type: "LRN" + bottom: "pool1/3x3_s2" + top: "pool1/norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2/3x3_reduce" + type: "Convolution" + bottom: "pool1/norm1" + top: "conv2/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "conv2/relu_3x3_reduce" + type: "ReLU" + bottom: "conv2/3x3_reduce" + top: "conv2/3x3_reduce" +} +layer { + name: "conv2/3x3" + type: "Convolution" + bottom: "conv2/3x3_reduce" + top: "conv2/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "conv2/relu_3x3" + type: "ReLU" + bottom: "conv2/3x3" + top: "conv2/3x3" +} +layer { + name: "conv2/norm2" + type: "LRN" + bottom: "conv2/3x3" + top: "conv2/norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2/3x3_s2" + type: "Pooling" + bottom: "conv2/norm2" + top: "pool2/3x3_s2" + pooling_param { + pool: MAX + 
kernel_size: 3 + stride: 2 + } +} +layer { + name: "inception_3a/1x1" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_1x1" + type: "ReLU" + bottom: "inception_3a/1x1" + top: "inception_3a/1x1" +} +layer { + name: "inception_3a/3x3_reduce" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_3a/3x3_reduce" + top: "inception_3a/3x3_reduce" +} +layer { + name: "inception_3a/3x3" + type: "Convolution" + bottom: "inception_3a/3x3_reduce" + top: "inception_3a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_3x3" + type: "ReLU" + bottom: "inception_3a/3x3" + top: "inception_3a/3x3" +} +layer { + name: "inception_3a/5x5_reduce" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_3a/5x5_reduce" + top: "inception_3a/5x5_reduce" +} +layer { + name: "inception_3a/5x5" + type: "Convolution" + bottom: "inception_3a/5x5_reduce" + top: "inception_3a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_5x5" + type: "ReLU" + bottom: "inception_3a/5x5" + top: "inception_3a/5x5" +} +layer { + name: "inception_3a/pool" + type: "Pooling" + bottom: "pool2/3x3_s2" + top: "inception_3a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_3a/pool_proj" + type: "Convolution" + bottom: "inception_3a/pool" + top: "inception_3a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_pool_proj" + type: "ReLU" + bottom: "inception_3a/pool_proj" + top: "inception_3a/pool_proj" +} +layer { + name: "inception_3a/output" + type: "Concat" + bottom: "inception_3a/1x1" + bottom: "inception_3a/3x3" + bottom: "inception_3a/5x5" + bottom: "inception_3a/pool_proj" + top: "inception_3a/output" +} +layer { + name: "inception_3b/1x1" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/1x1" + param { + 
lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_1x1" + type: "ReLU" + bottom: "inception_3b/1x1" + top: "inception_3b/1x1" +} +layer { + name: "inception_3b/3x3_reduce" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_3b/3x3_reduce" + top: "inception_3b/3x3_reduce" +} +layer { + name: "inception_3b/3x3" + type: "Convolution" + bottom: "inception_3b/3x3_reduce" + top: "inception_3b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_3x3" + type: "ReLU" + bottom: "inception_3b/3x3" + top: "inception_3b/3x3" +} +layer { + name: "inception_3b/5x5_reduce" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_3b/5x5_reduce" + top: "inception_3b/5x5_reduce" +} +layer { + name: "inception_3b/5x5" + type: "Convolution" + bottom: "inception_3b/5x5_reduce" + top: "inception_3b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_5x5" + type: "ReLU" + bottom: "inception_3b/5x5" + top: "inception_3b/5x5" +} +layer { + name: "inception_3b/pool" + type: "Pooling" + bottom: "inception_3a/output" + top: "inception_3b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_3b/pool_proj" + type: "Convolution" + bottom: "inception_3b/pool" + top: "inception_3b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_pool_proj" + type: "ReLU" + bottom: "inception_3b/pool_proj" + top: "inception_3b/pool_proj" +} +layer { + name: "inception_3b/output" + type: "Concat" + bottom: "inception_3b/1x1" + bottom: "inception_3b/3x3" + bottom: "inception_3b/5x5" + bottom: "inception_3b/pool_proj" + top: "inception_3b/output" +} +layer { + name: "pool3/3x3_s2" + type: "Pooling" + bottom: "inception_3b/output" + top: "pool3/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "inception_4a/1x1" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: 
"inception_4a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_1x1" + type: "ReLU" + bottom: "inception_4a/1x1" + top: "inception_4a/1x1" +} +layer { + name: "inception_4a/3x3_reduce" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4a/3x3_reduce" + top: "inception_4a/3x3_reduce" +} +layer { + name: "inception_4a/3x3" + type: "Convolution" + bottom: "inception_4a/3x3_reduce" + top: "inception_4a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 208 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_3x3" + type: "ReLU" + bottom: "inception_4a/3x3" + top: "inception_4a/3x3" +} +layer { + name: "inception_4a/5x5_reduce" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4a/5x5_reduce" + top: "inception_4a/5x5_reduce" +} +layer { + name: "inception_4a/5x5" + type: "Convolution" + bottom: "inception_4a/5x5_reduce" + top: "inception_4a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_5x5" + type: "ReLU" + bottom: "inception_4a/5x5" + top: "inception_4a/5x5" +} +layer { + name: "inception_4a/pool" + type: "Pooling" + bottom: "pool3/3x3_s2" + top: "inception_4a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4a/pool_proj" + type: "Convolution" + bottom: "inception_4a/pool" + top: "inception_4a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_pool_proj" + type: "ReLU" + bottom: "inception_4a/pool_proj" + top: "inception_4a/pool_proj" +} +layer { + name: "inception_4a/output" + type: "Concat" + bottom: "inception_4a/1x1" + bottom: "inception_4a/3x3" + bottom: "inception_4a/5x5" + bottom: "inception_4a/pool_proj" + top: "inception_4a/output" +} +layer { + name: "inception_4b/1x1" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 
1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_1x1" + type: "ReLU" + bottom: "inception_4b/1x1" + top: "inception_4b/1x1" +} +layer { + name: "inception_4b/3x3_reduce" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 112 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4b/3x3_reduce" + top: "inception_4b/3x3_reduce" +} +layer { + name: "inception_4b/3x3" + type: "Convolution" + bottom: "inception_4b/3x3_reduce" + top: "inception_4b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 224 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_3x3" + type: "ReLU" + bottom: "inception_4b/3x3" + top: "inception_4b/3x3" +} +layer { + name: "inception_4b/5x5_reduce" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4b/5x5_reduce" + top: "inception_4b/5x5_reduce" +} +layer { + name: "inception_4b/5x5" + type: "Convolution" + bottom: "inception_4b/5x5_reduce" + top: "inception_4b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_5x5" + type: "ReLU" + bottom: "inception_4b/5x5" + top: "inception_4b/5x5" +} +layer { + name: "inception_4b/pool" + type: "Pooling" + bottom: "inception_4a/output" + top: "inception_4b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4b/pool_proj" + type: "Convolution" + bottom: "inception_4b/pool" + top: "inception_4b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_pool_proj" + type: "ReLU" + bottom: "inception_4b/pool_proj" + top: "inception_4b/pool_proj" +} +layer { + name: "inception_4b/output" + type: "Concat" + bottom: "inception_4b/1x1" + bottom: "inception_4b/3x3" + bottom: "inception_4b/5x5" + bottom: "inception_4b/pool_proj" + top: "inception_4b/output" +} +layer { + name: "inception_4c/1x1" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: 
"inception_4c/relu_1x1" + type: "ReLU" + bottom: "inception_4c/1x1" + top: "inception_4c/1x1" +} +layer { + name: "inception_4c/3x3_reduce" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4c/3x3_reduce" + top: "inception_4c/3x3_reduce" +} +layer { + name: "inception_4c/3x3" + type: "Convolution" + bottom: "inception_4c/3x3_reduce" + top: "inception_4c/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_3x3" + type: "ReLU" + bottom: "inception_4c/3x3" + top: "inception_4c/3x3" +} +layer { + name: "inception_4c/5x5_reduce" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4c/5x5_reduce" + top: "inception_4c/5x5_reduce" +} +layer { + name: "inception_4c/5x5" + type: "Convolution" + bottom: "inception_4c/5x5_reduce" + top: "inception_4c/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_5x5" + type: "ReLU" + bottom: "inception_4c/5x5" + top: "inception_4c/5x5" +} +layer { + name: "inception_4c/pool" + type: "Pooling" + bottom: "inception_4b/output" + top: "inception_4c/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4c/pool_proj" + type: "Convolution" + bottom: "inception_4c/pool" + top: "inception_4c/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_pool_proj" + type: "ReLU" + bottom: "inception_4c/pool_proj" + top: "inception_4c/pool_proj" +} +layer { + name: "inception_4c/output" + type: "Concat" + bottom: "inception_4c/1x1" + bottom: "inception_4c/3x3" + bottom: "inception_4c/5x5" + bottom: "inception_4c/pool_proj" + top: "inception_4c/output" +} +layer { + name: "inception_4d/1x1" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 112 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_1x1" + type: "ReLU" + bottom: "inception_4d/1x1" + top: "inception_4d/1x1" +} +layer { + name: 
"inception_4d/3x3_reduce" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 144 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4d/3x3_reduce" + top: "inception_4d/3x3_reduce" +} +layer { + name: "inception_4d/3x3" + type: "Convolution" + bottom: "inception_4d/3x3_reduce" + top: "inception_4d/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 288 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_3x3" + type: "ReLU" + bottom: "inception_4d/3x3" + top: "inception_4d/3x3" +} +layer { + name: "inception_4d/5x5_reduce" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4d/5x5_reduce" + top: "inception_4d/5x5_reduce" +} +layer { + name: "inception_4d/5x5" + type: "Convolution" + bottom: "inception_4d/5x5_reduce" + top: "inception_4d/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_5x5" + type: "ReLU" + bottom: "inception_4d/5x5" + top: "inception_4d/5x5" +} +layer { + name: "inception_4d/pool" + type: "Pooling" + bottom: "inception_4c/output" + top: "inception_4d/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4d/pool_proj" + type: "Convolution" + bottom: "inception_4d/pool" + top: "inception_4d/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_pool_proj" + type: "ReLU" + bottom: "inception_4d/pool_proj" + top: "inception_4d/pool_proj" +} +layer { + name: "inception_4d/output" + type: "Concat" + bottom: "inception_4d/1x1" + bottom: "inception_4d/3x3" + bottom: "inception_4d/5x5" + bottom: "inception_4d/pool_proj" + top: "inception_4d/output" +} +layer { + name: "inception_4e/1x1" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_1x1" + type: "ReLU" + bottom: "inception_4e/1x1" + top: "inception_4e/1x1" +} +layer { + name: "inception_4e/3x3_reduce" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/3x3_reduce" + param { + lr_mult: 1 + 
decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4e/3x3_reduce" + top: "inception_4e/3x3_reduce" +} +layer { + name: "inception_4e/3x3" + type: "Convolution" + bottom: "inception_4e/3x3_reduce" + top: "inception_4e/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 320 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_3x3" + type: "ReLU" + bottom: "inception_4e/3x3" + top: "inception_4e/3x3" +} +layer { + name: "inception_4e/5x5_reduce" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4e/5x5_reduce" + top: "inception_4e/5x5_reduce" +} +layer { + name: "inception_4e/5x5" + type: "Convolution" + bottom: "inception_4e/5x5_reduce" + top: "inception_4e/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_5x5" + type: "ReLU" + bottom: "inception_4e/5x5" + top: "inception_4e/5x5" +} +layer { + name: "inception_4e/pool" + type: "Pooling" + bottom: "inception_4d/output" + top: "inception_4e/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4e/pool_proj" + type: "Convolution" + bottom: "inception_4e/pool" + top: "inception_4e/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_pool_proj" + type: "ReLU" + bottom: "inception_4e/pool_proj" + top: "inception_4e/pool_proj" +} +layer { + name: "inception_4e/output" + type: "Concat" + bottom: "inception_4e/1x1" + bottom: "inception_4e/3x3" + bottom: "inception_4e/5x5" + bottom: "inception_4e/pool_proj" + top: "inception_4e/output" +} +layer { + name: "pool4/3x3_s2" + type: "Pooling" + bottom: "inception_4e/output" + top: "pool4/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "inception_5a/1x1" + type: "Convolution" + bottom: "pool4/3x3_s2" + top: "inception_5a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_1x1" + type: "ReLU" + bottom: "inception_5a/1x1" + top: "inception_5a/1x1" +} +layer { + name: "inception_5a/3x3_reduce" + type: "Convolution" + bottom: "pool4/3x3_s2" + top: "inception_5a/3x3_reduce" + 
param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_5a/3x3_reduce" + top: "inception_5a/3x3_reduce" +} +layer { + name: "inception_5a/3x3" + type: "Convolution" + bottom: "inception_5a/3x3_reduce" + top: "inception_5a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 320 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_3x3" + type: "ReLU" + bottom: "inception_5a/3x3" + top: "inception_5a/3x3" +} +layer { + name: "inception_5a/5x5_reduce" + type: "Convolution" + bottom: "pool4/3x3_s2" + top: "inception_5a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_5a/5x5_reduce" + top: "inception_5a/5x5_reduce" +} +layer { + name: "inception_5a/5x5" + type: "Convolution" + bottom: "inception_5a/5x5_reduce" + top: "inception_5a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_5x5" + type: "ReLU" + bottom: "inception_5a/5x5" + top: "inception_5a/5x5" +} +layer { + name: "inception_5a/pool" + type: "Pooling" + bottom: "pool4/3x3_s2" + top: "inception_5a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_5a/pool_proj" + type: "Convolution" + bottom: "inception_5a/pool" + top: "inception_5a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_pool_proj" + type: "ReLU" + bottom: "inception_5a/pool_proj" + top: "inception_5a/pool_proj" +} +layer { + name: "inception_5a/output" + type: "Concat" + bottom: "inception_5a/1x1" + bottom: "inception_5a/3x3" + bottom: "inception_5a/5x5" + bottom: "inception_5a/pool_proj" + top: "inception_5a/output" +} +layer { + name: "inception_5b/1x1" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_1x1" + type: "ReLU" + bottom: "inception_5b/1x1" + top: "inception_5b/1x1" +} +layer { + name: "inception_5b/3x3_reduce" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + kernel_size: 1 + 
weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_5b/3x3_reduce" + top: "inception_5b/3x3_reduce" +} +layer { + name: "inception_5b/3x3" + type: "Convolution" + bottom: "inception_5b/3x3_reduce" + top: "inception_5b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_3x3" + type: "ReLU" + bottom: "inception_5b/3x3" + top: "inception_5b/3x3" +} +layer { + name: "inception_5b/5x5_reduce" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_5b/5x5_reduce" + top: "inception_5b/5x5_reduce" +} +layer { + name: "inception_5b/5x5" + type: "Convolution" + bottom: "inception_5b/5x5_reduce" + top: "inception_5b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_5x5" + type: "ReLU" + bottom: "inception_5b/5x5" + top: "inception_5b/5x5" +} +layer { + name: "inception_5b/pool" + type: "Pooling" + bottom: "inception_5a/output" + top: "inception_5b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_5b/pool_proj" + type: "Convolution" + bottom: "inception_5b/pool" + top: "inception_5b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_pool_proj" + type: "ReLU" + bottom: "inception_5b/pool_proj" + top: "inception_5b/pool_proj" +} +layer { + name: "inception_5b/output" + type: "Concat" + bottom: "inception_5b/1x1" + bottom: "inception_5b/3x3" + bottom: "inception_5b/5x5" + bottom: "inception_5b/pool_proj" + top: "inception_5b/output" +} +layer { + name: "pool5/7x7_s1" + type: "Pooling" + bottom: "inception_5b/output" + top: "pool5/7x7_s1" + pooling_param { + pool: AVE + kernel_size: 7 + stride: 1 + } +} +layer { + name: "pool5/drop_7x7_s1" + type: "Dropout" + bottom: "pool5/7x7_s1" + top: "pool5/7x7_s1" + dropout_param { + dropout_ratio: 0.4 + } +} +layer { + name: "loss3/classifier" + type: "InnerProduct" + bottom: "pool5/7x7_s1" + top: "loss3/classifier" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "prob" + type: "Softmax" + bottom: "loss3/classifier" + top: "prob" +} diff --git a/googlenet/synset_words.txt b/googlenet/synset_words.txt new file mode 100644 index 0000000..a9e8c7f --- /dev/null +++ 
b/googlenet/synset_words.txt @@ -0,0 +1,1000 @@ +n01440764 tench, Tinca tinca +n01443537 goldfish, Carassius auratus +n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +n01491361 tiger shark, Galeocerdo cuvieri +n01494475 hammerhead, hammerhead shark +n01496331 electric ray, crampfish, numbfish, torpedo +n01498041 stingray +n01514668 cock +n01514859 hen +n01518878 ostrich, Struthio camelus +n01530575 brambling, Fringilla montifringilla +n01531178 goldfinch, Carduelis carduelis +n01532829 house finch, linnet, Carpodacus mexicanus +n01534433 junco, snowbird +n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea +n01558993 robin, American robin, Turdus migratorius +n01560419 bulbul +n01580077 jay +n01582220 magpie +n01592084 chickadee +n01601694 water ouzel, dipper +n01608432 kite +n01614925 bald eagle, American eagle, Haliaeetus leucocephalus +n01616318 vulture +n01622779 great grey owl, great gray owl, Strix nebulosa +n01629819 European fire salamander, Salamandra salamandra +n01630670 common newt, Triturus vulgaris +n01631663 eft +n01632458 spotted salamander, Ambystoma maculatum +n01632777 axolotl, mud puppy, Ambystoma mexicanum +n01641577 bullfrog, Rana catesbeiana +n01644373 tree frog, tree-frog +n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +n01664065 loggerhead, loggerhead turtle, Caretta caretta +n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +n01667114 mud turtle +n01667778 terrapin +n01669191 box turtle, box tortoise +n01675722 banded gecko +n01677366 common iguana, iguana, Iguana iguana +n01682714 American chameleon, anole, Anolis carolinensis +n01685808 whiptail, whiptail lizard +n01687978 agama +n01688243 frilled lizard, Chlamydosaurus kingi +n01689811 alligator lizard +n01692333 Gila monster, Heloderma suspectum +n01693334 green lizard, Lacerta viridis +n01694178 African chameleon, Chamaeleo chamaeleon +n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +n01697457 African crocodile, Nile crocodile, Crocodylus niloticus +n01698640 American alligator, Alligator mississipiensis +n01704323 triceratops +n01728572 thunder snake, worm snake, Carphophis amoenus +n01728920 ringneck snake, ring-necked snake, ring snake +n01729322 hognose snake, puff adder, sand viper +n01729977 green snake, grass snake +n01734418 king snake, kingsnake +n01735189 garter snake, grass snake +n01737021 water snake +n01739381 vine snake +n01740131 night snake, Hypsiglena torquata +n01742172 boa constrictor, Constrictor constrictor +n01744401 rock python, rock snake, Python sebae +n01748264 Indian cobra, Naja naja +n01749939 green mamba +n01751748 sea snake +n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus +n01756291 sidewinder, horned rattlesnake, Crotalus cerastes +n01768244 trilobite +n01770081 harvestman, daddy longlegs, Phalangium opilio +n01770393 scorpion +n01773157 black and gold garden spider, Argiope aurantia +n01773549 barn spider, Araneus cavaticus +n01773797 garden spider, Aranea diademata +n01774384 black widow, Latrodectus mactans +n01774750 tarantula +n01775062 wolf spider, hunting spider +n01776313 tick +n01784675 centipede +n01795545 black grouse +n01796340 ptarmigan +n01797886 ruffed grouse, partridge, Bonasa umbellus +n01798484 prairie chicken, prairie grouse, prairie fowl +n01806143 peacock +n01806567 quail +n01807496 partridge +n01817953 
African grey, African gray, Psittacus erithacus +n01818515 macaw +n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +n01820546 lorikeet +n01824575 coucal +n01828970 bee eater +n01829413 hornbill +n01833805 hummingbird +n01843065 jacamar +n01843383 toucan +n01847000 drake +n01855032 red-breasted merganser, Mergus serrator +n01855672 goose +n01860187 black swan, Cygnus atratus +n01871265 tusker +n01872401 echidna, spiny anteater, anteater +n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +n01877812 wallaby, brush kangaroo +n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +n01883070 wombat +n01910747 jellyfish +n01914609 sea anemone, anemone +n01917289 brain coral +n01924916 flatworm, platyhelminth +n01930112 nematode, nematode worm, roundworm +n01943899 conch +n01944390 snail +n01945685 slug +n01950731 sea slug, nudibranch +n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore +n01968897 chambered nautilus, pearly nautilus, nautilus +n01978287 Dungeness crab, Cancer magister +n01978455 rock crab, Cancer irroratus +n01980166 fiddler crab +n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus +n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +n01985128 crayfish, crawfish, crawdad, crawdaddy +n01986214 hermit crab +n01990800 isopod +n02002556 white stork, Ciconia ciconia +n02002724 black stork, Ciconia nigra +n02006656 spoonbill +n02007558 flamingo +n02009229 little blue heron, Egretta caerulea +n02009912 American egret, great white heron, Egretta albus +n02011460 bittern +n02012849 crane +n02013706 limpkin, Aramus pictus +n02017213 European gallinule, Porphyrio porphyrio +n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana +n02018795 bustard +n02025239 ruddy turnstone, Arenaria interpres +n02027492 red-backed sandpiper, dunlin, Erolia alpina +n02028035 redshank, Tringa totanus +n02033041 dowitcher +n02037110 oystercatcher, oyster catcher +n02051845 pelican +n02056570 king penguin, Aptenodytes patagonica +n02058221 albatross, mollymawk +n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +n02074367 dugong, Dugong dugon +n02077923 sea lion +n02085620 Chihuahua +n02085782 Japanese spaniel +n02085936 Maltese dog, Maltese terrier, Maltese +n02086079 Pekinese, Pekingese, Peke +n02086240 Shih-Tzu +n02086646 Blenheim spaniel +n02086910 papillon +n02087046 toy terrier +n02087394 Rhodesian ridgeback +n02088094 Afghan hound, Afghan +n02088238 basset, basset hound +n02088364 beagle +n02088466 bloodhound, sleuthhound +n02088632 bluetick +n02089078 black-and-tan coonhound +n02089867 Walker hound, Walker foxhound +n02089973 English foxhound +n02090379 redbone +n02090622 borzoi, Russian wolfhound +n02090721 Irish wolfhound +n02091032 Italian greyhound +n02091134 whippet +n02091244 Ibizan hound, Ibizan Podenco +n02091467 Norwegian elkhound, elkhound +n02091635 otterhound, otter hound +n02091831 Saluki, gazelle hound +n02092002 Scottish deerhound, deerhound +n02092339 Weimaraner +n02093256 Staffordshire bullterrier, Staffordshire bull terrier +n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +n02093647 Bedlington terrier +n02093754 Border terrier +n02093859 
Kerry blue terrier +n02093991 Irish terrier +n02094114 Norfolk terrier +n02094258 Norwich terrier +n02094433 Yorkshire terrier +n02095314 wire-haired fox terrier +n02095570 Lakeland terrier +n02095889 Sealyham terrier, Sealyham +n02096051 Airedale, Airedale terrier +n02096177 cairn, cairn terrier +n02096294 Australian terrier +n02096437 Dandie Dinmont, Dandie Dinmont terrier +n02096585 Boston bull, Boston terrier +n02097047 miniature schnauzer +n02097130 giant schnauzer +n02097209 standard schnauzer +n02097298 Scotch terrier, Scottish terrier, Scottie +n02097474 Tibetan terrier, chrysanthemum dog +n02097658 silky terrier, Sydney silky +n02098105 soft-coated wheaten terrier +n02098286 West Highland white terrier +n02098413 Lhasa, Lhasa apso +n02099267 flat-coated retriever +n02099429 curly-coated retriever +n02099601 golden retriever +n02099712 Labrador retriever +n02099849 Chesapeake Bay retriever +n02100236 German short-haired pointer +n02100583 vizsla, Hungarian pointer +n02100735 English setter +n02100877 Irish setter, red setter +n02101006 Gordon setter +n02101388 Brittany spaniel +n02101556 clumber, clumber spaniel +n02102040 English springer, English springer spaniel +n02102177 Welsh springer spaniel +n02102318 cocker spaniel, English cocker spaniel, cocker +n02102480 Sussex spaniel +n02102973 Irish water spaniel +n02104029 kuvasz +n02104365 schipperke +n02105056 groenendael +n02105162 malinois +n02105251 briard +n02105412 kelpie +n02105505 komondor +n02105641 Old English sheepdog, bobtail +n02105855 Shetland sheepdog, Shetland sheep dog, Shetland +n02106030 collie +n02106166 Border collie +n02106382 Bouvier des Flandres, Bouviers des Flandres +n02106550 Rottweiler +n02106662 German shepherd, German shepherd dog, German police dog, alsatian +n02107142 Doberman, Doberman pinscher +n02107312 miniature pinscher +n02107574 Greater Swiss Mountain dog +n02107683 Bernese mountain dog +n02107908 Appenzeller +n02108000 EntleBucher +n02108089 boxer +n02108422 bull mastiff +n02108551 Tibetan mastiff +n02108915 French bulldog +n02109047 Great Dane +n02109525 Saint Bernard, St Bernard +n02109961 Eskimo dog, husky +n02110063 malamute, malemute, Alaskan malamute +n02110185 Siberian husky +n02110341 dalmatian, coach dog, carriage dog +n02110627 affenpinscher, monkey pinscher, monkey dog +n02110806 basenji +n02110958 pug, pug-dog +n02111129 Leonberg +n02111277 Newfoundland, Newfoundland dog +n02111500 Great Pyrenees +n02111889 Samoyed, Samoyede +n02112018 Pomeranian +n02112137 chow, chow chow +n02112350 keeshond +n02112706 Brabancon griffon +n02113023 Pembroke, Pembroke Welsh corgi +n02113186 Cardigan, Cardigan Welsh corgi +n02113624 toy poodle +n02113712 miniature poodle +n02113799 standard poodle +n02113978 Mexican hairless +n02114367 timber wolf, grey wolf, gray wolf, Canis lupus +n02114548 white wolf, Arctic wolf, Canis lupus tundrarum +n02114712 red wolf, maned wolf, Canis rufus, Canis niger +n02114855 coyote, prairie wolf, brush wolf, Canis latrans +n02115641 dingo, warrigal, warragal, Canis dingo +n02115913 dhole, Cuon alpinus +n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +n02117135 hyena, hyaena +n02119022 red fox, Vulpes vulpes +n02119789 kit fox, Vulpes macrotis +n02120079 Arctic fox, white fox, Alopex lagopus +n02120505 grey fox, gray fox, Urocyon cinereoargenteus +n02123045 tabby, tabby cat +n02123159 tiger cat +n02123394 Persian cat +n02123597 Siamese cat, Siamese +n02124075 Egyptian cat +n02125311 cougar, puma, catamount, mountain lion, painter, panther, 
Felis concolor +n02127052 lynx, catamount +n02128385 leopard, Panthera pardus +n02128757 snow leopard, ounce, Panthera uncia +n02128925 jaguar, panther, Panthera onca, Felis onca +n02129165 lion, king of beasts, Panthera leo +n02129604 tiger, Panthera tigris +n02130308 cheetah, chetah, Acinonyx jubatus +n02132136 brown bear, bruin, Ursus arctos +n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus +n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +n02134418 sloth bear, Melursus ursinus, Ursus ursinus +n02137549 mongoose +n02138441 meerkat, mierkat +n02165105 tiger beetle +n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +n02167151 ground beetle, carabid beetle +n02168699 long-horned beetle, longicorn, longicorn beetle +n02169497 leaf beetle, chrysomelid +n02172182 dung beetle +n02174001 rhinoceros beetle +n02177972 weevil +n02190166 fly +n02206856 bee +n02219486 ant, emmet, pismire +n02226429 grasshopper, hopper +n02229544 cricket +n02231487 walking stick, walkingstick, stick insect +n02233338 cockroach, roach +n02236044 mantis, mantid +n02256656 cicada, cicala +n02259212 leafhopper +n02264363 lacewing, lacewing fly +n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +n02268853 damselfly +n02276258 admiral +n02277742 ringlet, ringlet butterfly +n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +n02280649 cabbage butterfly +n02281406 sulphur butterfly, sulfur butterfly +n02281787 lycaenid, lycaenid butterfly +n02317335 starfish, sea star +n02319095 sea urchin +n02321529 sea cucumber, holothurian +n02325366 wood rabbit, cottontail, cottontail rabbit +n02326432 hare +n02328150 Angora, Angora rabbit +n02342885 hamster +n02346627 porcupine, hedgehog +n02356798 fox squirrel, eastern fox squirrel, Sciurus niger +n02361337 marmot +n02363005 beaver +n02364673 guinea pig, Cavia cobaya +n02389026 sorrel +n02391049 zebra +n02395406 hog, pig, grunter, squealer, Sus scrofa +n02396427 wild boar, boar, Sus scrofa +n02397096 warthog +n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius +n02403003 ox +n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +n02410509 bison +n02412080 ram, tup +n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +n02417914 ibex, Capra ibex +n02422106 hartebeest +n02422699 impala, Aepyceros melampus +n02423022 gazelle +n02437312 Arabian camel, dromedary, Camelus dromedarius +n02437616 llama +n02441942 weasel +n02442845 mink +n02443114 polecat, fitch, foulmart, foumart, Mustela putorius +n02443484 black-footed ferret, ferret, Mustela nigripes +n02444819 otter +n02445715 skunk, polecat, wood pussy +n02447366 badger +n02454379 armadillo +n02457408 three-toed sloth, ai, Bradypus tridactylus +n02480495 orangutan, orang, orangutang, Pongo pygmaeus +n02480855 gorilla, Gorilla gorilla +n02481823 chimpanzee, chimp, Pan troglodytes +n02483362 gibbon, Hylobates lar +n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus +n02484975 guenon, guenon monkey +n02486261 patas, hussar monkey, Erythrocebus patas +n02486410 baboon +n02487347 macaque +n02488291 langur +n02488702 colobus, colobus monkey +n02489166 proboscis monkey, Nasalis larvatus +n02490219 marmoset +n02492035 capuchin, ringtail, Cebus capucinus +n02492660 howler monkey, howler +n02493509 titi, titi monkey +n02493793 spider monkey, Ateles geoffroyi +n02494079 
squirrel monkey, Saimiri sciureus +n02497673 Madagascar cat, ring-tailed lemur, Lemur catta +n02500267 indri, indris, Indri indri, Indri brevicaudatus +n02504013 Indian elephant, Elephas maximus +n02504458 African elephant, Loxodonta africana +n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +n02514041 barracouta, snoek +n02526121 eel +n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +n02606052 rock beauty, Holocanthus tricolor +n02607072 anemone fish +n02640242 sturgeon +n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus +n02643566 lionfish +n02655020 puffer, pufferfish, blowfish, globefish +n02666196 abacus +n02667093 abaya +n02669723 academic gown, academic robe, judge's robe +n02672831 accordion, piano accordion, squeeze box +n02676566 acoustic guitar +n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier +n02690373 airliner +n02692877 airship, dirigible +n02699494 altar +n02701002 ambulance +n02704792 amphibian, amphibious vehicle +n02708093 analog clock +n02727426 apiary, bee house +n02730930 apron +n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +n02749479 assault rifle, assault gun +n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack +n02776631 bakery, bakeshop, bakehouse +n02777292 balance beam, beam +n02782093 balloon +n02783161 ballpoint, ballpoint pen, ballpen, Biro +n02786058 Band Aid +n02787622 banjo +n02788148 bannister, banister, balustrade, balusters, handrail +n02790996 barbell +n02791124 barber chair +n02791270 barbershop +n02793495 barn +n02794156 barometer +n02795169 barrel, cask +n02797295 barrow, garden cart, lawn cart, wheelbarrow +n02799071 baseball +n02802426 basketball +n02804414 bassinet +n02804610 bassoon +n02807133 bathing cap, swimming cap +n02808304 bath towel +n02808440 bathtub, bathing tub, bath, tub +n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +n02814860 beacon, lighthouse, beacon light, pharos +n02815834 beaker +n02817516 bearskin, busby, shako +n02823428 beer bottle +n02823750 beer glass +n02825657 bell cote, bell cot +n02834397 bib +n02835271 bicycle-built-for-two, tandem bicycle, tandem +n02837789 bikini, two-piece +n02840245 binder, ring-binder +n02841315 binoculars, field glasses, opera glasses +n02843684 birdhouse +n02859443 boathouse +n02860847 bobsled, bobsleigh, bob +n02865351 bolo tie, bolo, bola tie, bola +n02869837 bonnet, poke bonnet +n02870880 bookcase +n02871525 bookshop, bookstore, bookstall +n02877765 bottlecap +n02879718 bow +n02883205 bow tie, bow-tie, bowtie +n02892201 brass, memorial tablet, plaque +n02892767 brassiere, bra, bandeau +n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty +n02895154 breastplate, aegis, egis +n02906734 broom +n02909870 bucket, pail +n02910353 buckle +n02916936 bulletproof vest +n02917067 bullet train, bullet +n02927161 butcher shop, meat market +n02930766 cab, hack, taxi, taxicab +n02939185 caldron, cauldron +n02948072 candle, taper, wax light +n02950826 cannon +n02951358 canoe +n02951585 can opener, tin opener +n02963159 cardigan +n02965783 car mirror +n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig +n02966687 carpenter's kit, tool kit +n02971356 carton +n02974003 car wheel +n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated 
teller, automatic teller, ATM +n02978881 cassette +n02979186 cassette player +n02980441 castle +n02981792 catamaran +n02988304 CD player +n02992211 cello, violoncello +n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone +n02999410 chain +n03000134 chainlink fence +n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +n03000684 chain saw, chainsaw +n03014705 chest +n03016953 chiffonier, commode +n03017168 chime, bell, gong +n03018349 china cabinet, china closet +n03026506 Christmas stocking +n03028079 church, church building +n03032252 cinema, movie theater, movie theatre, movie house, picture palace +n03041632 cleaver, meat cleaver, chopper +n03042490 cliff dwelling +n03045698 cloak +n03047690 clog, geta, patten, sabot +n03062245 cocktail shaker +n03063599 coffee mug +n03063689 coffeepot +n03065424 coil, spiral, volute, whorl, helix +n03075370 combination lock +n03085013 computer keyboard, keypad +n03089624 confectionery, confectionary, candy store +n03095699 container ship, containership, container vessel +n03100240 convertible +n03109150 corkscrew, bottle screw +n03110669 cornet, horn, trumpet, trump +n03124043 cowboy boot +n03124170 cowboy hat, ten-gallon hat +n03125729 cradle +n03126707 crane +n03127747 crash helmet +n03127925 crate +n03131574 crib, cot +n03133878 Crock Pot +n03134739 croquet ball +n03141823 crutch +n03146219 cuirass +n03160309 dam, dike, dyke +n03179701 desk +n03180011 desktop computer +n03187595 dial telephone, dial phone +n03188531 diaper, nappy, napkin +n03196217 digital clock +n03197337 digital watch +n03201208 dining table, board +n03207743 dishrag, dishcloth +n03207941 dishwasher, dish washer, dishwashing machine +n03208938 disk brake, disc brake +n03216828 dock, dockage, docking facility +n03218198 dogsled, dog sled, dog sleigh +n03220513 dome +n03223299 doormat, welcome mat +n03240683 drilling platform, offshore rig +n03249569 drum, membranophone, tympan +n03250847 drumstick +n03255030 dumbbell +n03259280 Dutch oven +n03271574 electric fan, blower +n03272010 electric guitar +n03272562 electric locomotive +n03290653 entertainment center +n03291819 envelope +n03297495 espresso maker +n03314780 face powder +n03325584 feather boa, boa +n03337140 file, file cabinet, filing cabinet +n03344393 fireboat +n03345487 fire engine, fire truck +n03347037 fire screen, fireguard +n03355925 flagpole, flagstaff +n03372029 flute, transverse flute +n03376595 folding chair +n03379051 football helmet +n03384352 forklift +n03388043 fountain +n03388183 fountain pen +n03388549 four-poster +n03393912 freight car +n03394916 French horn, horn +n03400231 frying pan, frypan, skillet +n03404251 fur coat +n03417042 garbage truck, dustcart +n03424325 gasmask, respirator, gas helmet +n03425413 gas pump, gasoline pump, petrol pump, island dispenser +n03443371 goblet +n03444034 go-kart +n03445777 golf ball +n03445924 golfcart, golf cart +n03447447 gondola +n03447721 gong, tam-tam +n03450230 gown +n03452741 grand piano, grand +n03457902 greenhouse, nursery, glasshouse +n03459775 grille, radiator grille +n03461385 grocery store, grocery, food market, market +n03467068 guillotine +n03476684 hair slide +n03476991 hair spray +n03478589 half track +n03481172 hammer +n03482405 hamper +n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier +n03485407 hand-held computer, hand-held microcomputer +n03485794 handkerchief, hankie, hanky, hankey +n03492542 hard disc, hard disk, fixed disk +n03494278 harmonica, mouth organ, harp, mouth 
harp +n03495258 harp +n03496892 harvester, reaper +n03498962 hatchet +n03527444 holster +n03529860 home theater, home theatre +n03530642 honeycomb +n03532672 hook, claw +n03534580 hoopskirt, crinoline +n03535780 horizontal bar, high bar +n03538406 horse cart, horse-cart +n03544143 hourglass +n03584254 iPod +n03584829 iron, smoothing iron +n03590841 jack-o'-lantern +n03594734 jean, blue jean, denim +n03594945 jeep, landrover +n03595614 jersey, T-shirt, tee shirt +n03598930 jigsaw puzzle +n03599486 jinrikisha, ricksha, rickshaw +n03602883 joystick +n03617480 kimono +n03623198 knee pad +n03627232 knot +n03630383 lab coat, laboratory coat +n03633091 ladle +n03637318 lampshade, lamp shade +n03642806 laptop, laptop computer +n03649909 lawn mower, mower +n03657121 lens cap, lens cover +n03658185 letter opener, paper knife, paperknife +n03661043 library +n03662601 lifeboat +n03666591 lighter, light, igniter, ignitor +n03670208 limousine, limo +n03673027 liner, ocean liner +n03676483 lipstick, lip rouge +n03680355 Loafer +n03690938 lotion +n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +n03692522 loupe, jeweler's loupe +n03697007 lumbermill, sawmill +n03706229 magnetic compass +n03709823 mailbag, postbag +n03710193 mailbox, letter box +n03710637 maillot +n03710721 maillot, tank suit +n03717622 manhole cover +n03720891 maraca +n03721384 marimba, xylophone +n03724870 mask +n03729826 matchstick +n03733131 maypole +n03733281 maze, labyrinth +n03733805 measuring cup +n03742115 medicine chest, medicine cabinet +n03743016 megalith, megalithic structure +n03759954 microphone, mike +n03761084 microwave, microwave oven +n03763968 military uniform +n03764736 milk can +n03769881 minibus +n03770439 miniskirt, mini +n03770679 minivan +n03773504 missile +n03775071 mitten +n03775546 mixing bowl +n03776460 mobile home, manufactured home +n03777568 Model T +n03777754 modem +n03781244 monastery +n03782006 monitor +n03785016 moped +n03786901 mortar +n03787032 mortarboard +n03788195 mosque +n03788365 mosquito net +n03791053 motor scooter, scooter +n03792782 mountain bike, all-terrain bike, off-roader +n03792972 mountain tent +n03793489 mouse, computer mouse +n03794056 mousetrap +n03796401 moving van +n03803284 muzzle +n03804744 nail +n03814639 neck brace +n03814906 necklace +n03825788 nipple +n03832673 notebook, notebook computer +n03837869 obelisk +n03838899 oboe, hautboy, hautbois +n03840681 ocarina, sweet potato +n03841143 odometer, hodometer, mileometer, milometer +n03843555 oil filter +n03854065 organ, pipe organ +n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO +n03866082 overskirt +n03868242 oxcart +n03868863 oxygen mask +n03871628 packet +n03873416 paddle, boat paddle +n03874293 paddlewheel, paddle wheel +n03874599 padlock +n03876231 paintbrush +n03877472 pajama, pyjama, pj's, jammies +n03877845 palace +n03884397 panpipe, pandean pipe, syrinx +n03887697 paper towel +n03888257 parachute, chute +n03888605 parallel bars, bars +n03891251 park bench +n03891332 parking meter +n03895866 passenger car, coach, carriage +n03899768 patio, terrace +n03902125 pay-phone, pay-station +n03903868 pedestal, plinth, footstall +n03908618 pencil box, pencil case +n03908714 pencil sharpener +n03916031 perfume, essence +n03920288 Petri dish +n03924679 photocopier +n03929660 pick, plectrum, plectron +n03929855 pickelhaube +n03930313 picket fence, paling +n03930630 pickup, pickup truck +n03933933 pier +n03935335 piggy bank, penny bank +n03937543 pill bottle +n03938244 pillow +n03942813 
ping-pong ball +n03944341 pinwheel +n03947888 pirate, pirate ship +n03950228 pitcher, ewer +n03954731 plane, carpenter's plane, woodworking plane +n03956157 planetarium +n03958227 plastic bag +n03961711 plate rack +n03967562 plow, plough +n03970156 plunger, plumber's helper +n03976467 Polaroid camera, Polaroid Land camera +n03976657 pole +n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +n03980874 poncho +n03982430 pool table, billiard table, snooker table +n03983396 pop bottle, soda bottle +n03991062 pot, flowerpot +n03992509 potter's wheel +n03995372 power drill +n03998194 prayer rug, prayer mat +n04004767 printer +n04005630 prison, prison house +n04008634 projectile, missile +n04009552 projector +n04019541 puck, hockey puck +n04023962 punching bag, punch bag, punching ball, punchball +n04026417 purse +n04033901 quill, quill pen +n04033995 quilt, comforter, comfort, puff +n04037443 racer, race car, racing car +n04039381 racket, racquet +n04040759 radiator +n04041544 radio, wireless +n04044716 radio telescope, radio reflector +n04049303 rain barrel +n04065272 recreational vehicle, RV, R.V. +n04067472 reel +n04069434 reflex camera +n04070727 refrigerator, icebox +n04074963 remote control, remote +n04081281 restaurant, eating house, eating place, eatery +n04086273 revolver, six-gun, six-shooter +n04090263 rifle +n04099969 rocking chair, rocker +n04111531 rotisserie +n04116512 rubber eraser, rubber, pencil eraser +n04118538 rugby ball +n04118776 rule, ruler +n04120489 running shoe +n04125021 safe +n04127249 safety pin +n04131690 saltshaker, salt shaker +n04133789 sandal +n04136333 sarong +n04141076 sax, saxophone +n04141327 scabbard +n04141975 scale, weighing machine +n04146614 school bus +n04147183 schooner +n04149813 scoreboard +n04152593 screen, CRT screen +n04153751 screw +n04154565 screwdriver +n04162706 seat belt, seatbelt +n04179913 sewing machine +n04192698 shield, buckler +n04200800 shoe shop, shoe-shop, shoe store +n04201297 shoji +n04204238 shopping basket +n04204347 shopping cart +n04208210 shovel +n04209133 shower cap +n04209239 shower curtain +n04228054 ski +n04229816 ski mask +n04235860 sleeping bag +n04238763 slide rule, slipstick +n04239074 sliding door +n04243546 slot, one-armed bandit +n04251144 snorkel +n04252077 snowmobile +n04252225 snowplow, snowplough +n04254120 soap dispenser +n04254680 soccer ball +n04254777 sock +n04258138 solar dish, solar collector, solar furnace +n04259630 sombrero +n04263257 soup bowl +n04264628 space bar +n04265275 space heater +n04266014 space shuttle +n04270147 spatula +n04273569 speedboat +n04275548 spider web, spider's web +n04277352 spindle +n04285008 sports car, sport car +n04286575 spotlight, spot +n04296562 stage +n04310018 steam locomotive +n04311004 steel arch bridge +n04311174 steel drum +n04317175 stethoscope +n04325704 stole +n04326547 stone wall +n04328186 stopwatch, stop watch +n04330267 stove +n04332243 strainer +n04335435 streetcar, tram, tramcar, trolley, trolley car +n04336792 stretcher +n04344873 studio couch, day bed +n04346328 stupa, tope +n04347754 submarine, pigboat, sub, U-boat +n04350905 suit, suit of clothes +n04355338 sundial +n04355933 sunglass +n04356056 sunglasses, dark glasses, shades +n04357314 sunscreen, sunblock, sun blocker +n04366367 suspension bridge +n04367480 swab, swob, mop +n04370456 sweatshirt +n04371430 swimming trunks, bathing trunks +n04371774 swing +n04372370 switch, electric switch, electrical switch +n04376876 syringe +n04380533 table lamp +n04389033 tank, army 
tank, armored combat vehicle, armoured combat vehicle +n04392985 tape player +n04398044 teapot +n04399382 teddy, teddy bear +n04404412 television, television system +n04409515 tennis ball +n04417672 thatch, thatched roof +n04418357 theater curtain, theatre curtain +n04423845 thimble +n04428191 thresher, thrasher, threshing machine +n04429376 throne +n04435653 tile roof +n04442312 toaster +n04443257 tobacco shop, tobacconist shop, tobacconist +n04447861 toilet seat +n04456115 torch +n04458633 totem pole +n04461696 tow truck, tow car, wrecker +n04462240 toyshop +n04465501 tractor +n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +n04476259 tray +n04479046 trench coat +n04482393 tricycle, trike, velocipede +n04483307 trimaran +n04485082 tripod +n04486054 triumphal arch +n04487081 trolleybus, trolley coach, trackless trolley +n04487394 trombone +n04493381 tub, vat +n04501370 turnstile +n04505470 typewriter keyboard +n04507155 umbrella +n04509417 unicycle, monocycle +n04515003 upright, upright piano +n04517823 vacuum, vacuum cleaner +n04522168 vase +n04523525 vault +n04525038 velvet +n04525305 vending machine +n04532106 vestment +n04532670 viaduct +n04536866 violin, fiddle +n04540053 volleyball +n04542943 waffle iron +n04548280 wall clock +n04548362 wallet, billfold, notecase, pocketbook +n04550184 wardrobe, closet, press +n04552348 warplane, military plane +n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin +n04554684 washer, automatic washer, washing machine +n04557648 water bottle +n04560804 water jug +n04562935 water tower +n04579145 whiskey jug +n04579432 whistle +n04584207 wig +n04589890 window screen +n04590129 window shade +n04591157 Windsor tie +n04591713 wine bottle +n04592741 wing +n04596742 wok +n04597913 wooden spoon +n04599235 wool, woolen, woollen +n04604644 worm fence, snake fence, snake-rail fence, Virginia fence +n04606251 wreck +n04612504 yawl +n04613696 yurt +n06359193 web site, website, internet site, site +n06596364 comic book +n06785654 crossword puzzle, crossword +n06794110 street sign +n06874185 traffic light, traffic signal, stoplight +n07248320 book jacket, dust cover, dust jacket, dust wrapper +n07565083 menu +n07579787 plate +n07583066 guacamole +n07584110 consomme +n07590611 hot pot, hotpot +n07613480 trifle +n07614500 ice cream, icecream +n07615774 ice lolly, lolly, lollipop, popsicle +n07684084 French loaf +n07693725 bagel, beigel +n07695742 pretzel +n07697313 cheeseburger +n07697537 hotdog, hot dog, red hot +n07711569 mashed potato +n07714571 head cabbage +n07714990 broccoli +n07715103 cauliflower +n07716358 zucchini, courgette +n07716906 spaghetti squash +n07717410 acorn squash +n07717556 butternut squash +n07718472 cucumber, cuke +n07718747 artichoke, globe artichoke +n07720875 bell pepper +n07730033 cardoon +n07734744 mushroom +n07742313 Granny Smith +n07745940 strawberry +n07747607 orange +n07749582 lemon +n07753113 fig +n07753275 pineapple, ananas +n07753592 banana +n07754684 jackfruit, jak, jack +n07760859 custard apple +n07768694 pomegranate +n07802026 hay +n07831146 carbonara +n07836838 chocolate sauce, chocolate syrup +n07860988 dough +n07871810 meat loaf, meatloaf +n07873807 pizza, pizza pie +n07875152 potpie +n07880968 burrito +n07892512 red wine +n07920052 espresso +n07930864 cup +n07932039 eggnog +n09193705 alp +n09229709 bubble +n09246464 cliff, drop, drop-off +n09256479 coral reef +n09288635 geyser +n09332890 lakeside, lakeshore +n09399592 promontory, headland, head, foreland +n09421951 sandbar, 
sand bar
+n09428293 seashore, coast, seacoast, sea-coast
+n09468604 valley, vale
+n09472597 volcano
+n09835506 ballplayer, baseball player
+n10148035 groom, bridegroom
+n10565667 scuba diver
+n11879895 rapeseed
+n11939491 daisy
+n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+n12144580 corn
+n12267677 acorn
+n12620546 hip, rose hip, rosehip
+n12768682 buckeye, horse chestnut, conker
+n12985857 coral fungus
+n12998815 agaric
+n13037406 gyromitra
+n13040303 stinkhorn, carrion fungus
+n13044778 earthstar
+n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+n13054560 bolete
+n13133613 ear, spike, capitulum
+n15075141 toilet tissue, toilet paper, bathroom tissue
diff --git a/modnet/README.md b/modnet/README.md
new file mode 100644
index 0000000..917faa4
--- /dev/null
+++ b/modnet/README.md
@@ -0,0 +1,51 @@
+# How to convert the original PyTorch MODNet model to ONNX
+
+The original pre-trained PyTorch MODNet model comes from [ZHKKKe/MODNet](https://github.com/ZHKKKe/MODNet). Note that this pre-trained model is under the [Creative Commons Attribution NonCommercial ShareAlike 4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
+
+You could use the script in this repository to convert the original PyTorch model to ONNX. I recommend doing the conversion inside a python3 virtual environment, since it requires specific versions of several pip3 packages. Below is a step-by-step guide on how to build the python3 virtual environment and then convert the PyTorch MODNet model to ONNX.
+
+1. Make sure the python3 "venv" module is installed.
+
+   ```shell
+   $ sudo apt install python3-venv
+   ```
+
+2. Create a virtual environment named "venv-onnx" and activate it.
+
+   ```shell
+   $ cd ${HOME}/project/tensorrt_demos/modnet
+   $ python3 -m venv venv-onnx
+   $ source venv-onnx/bin/activate
+   ```
+
+   At this point, you should be inside the virtual environment, and the shell prompt should be prefixed with "(venv-onnx) ". You could do `deactivate` to quit the virtual environment when you are done using it.
+
+   Download "torch-1.7.0-cp36-cp36m-linux_aarch64.whl" from here: [PyTorch for Jetson](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-8-0-now-available/72048). Then install all required packages into the virtual environment. (Note: the following should be done inside the "venv-onnx" virtual environment.)
+
+   ```shell
+   ### update pip to the latest version in the virtual env
+   $ curl https://bootstrap.pypa.io/get-pip.py | python
+   ### update these essential packages
+   $ python -m pip install -U setuptools Cython
+   ### I recommend numpy 1.16.x on Jetson
+   $ python -m pip install "numpy<1.17.0"
+   ### install cv2 into the virtual env
+   $ cp -r /usr/lib/python3.6/dist-packages/cv2 venv-onnx/lib/python3.6/site-packages/
+   ### install PyImage, onnx and onnxruntime
+   $ python -m pip install PyImage onnx==1.8.1 onnxruntime==1.6.0
+   ### install PyTorch v1.7.0
+   $ sudo apt install libopenblas-base libopenmpi-dev
+   $ python -m pip install ${HOME}/Downloads/torch-1.7.0-cp36-cp36m-linux_aarch64.whl
+   ```
+
+   In addition, you might also install [onnx-graphsurgeon](https://pypi.org/project/onnx-graphsurgeon/) and [polygraphy](https://pypi.org/project/polygraphy/) for debugging. Otherwise, you could do some simple testing to make sure "onnx" and "torch" are working OK in the virtual env.
+
+3. 
Download the pre-trained MODNet model (PyTorch checkpoint file) from the link on this page: [/ZHKKKe/MODNet/pretrained](https://github.com/ZHKKKe/MODNet/tree/master/pretrained). I recommend using "modnet_webcam_portrait_matting.ckpt". Just put the file in the current directory. + +4. Do the conversion using the following command. The ouput "modnet.onnx" would be generated. + + ```shell + $ python -m torch2onnx.export modnet_webcam_portrait_matting.ckpt modnet.onnx + ``` + + By default, the "torch2onnx.export" script sets input image width and height to 512x288. They could be modified by the "--width" and "--height" command-line options. In addition, the "-v" command-line option could be used to enable verbose logs of `torch.onnx.export()`. diff --git a/modnet/install_pycuda.sh b/modnet/install_pycuda.sh new file mode 100755 index 0000000..578ad60 --- /dev/null +++ b/modnet/install_pycuda.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Reference for installing 'pycuda': https://wiki.tiker.net/PyCuda/Installation/Linux/Ubuntu + +set -e + +if ! which nvcc > /dev/null; then + echo "ERROR: nvcc not found" + exit +fi + +arch=$(uname -m) +folder=${HOME}/src +mkdir -p $folder + +echo "** Install requirements" +sudo apt-get install -y build-essential python3-dev +sudo apt-get install -y libboost-python-dev libboost-thread-dev +sudo pip3 install setuptools + +boost_pylib=$(basename /usr/lib/${arch}-linux-gnu/libboost_python*-py3?.so) +boost_pylibname=${boost_pylib%.so} +boost_pyname=${boost_pylibname/lib/} + +echo "** Download pycuda-2019.1.2 sources" +pushd $folder +if [ ! -f pycuda-2019.1.2.tar.gz ]; then + wget https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz +fi + +echo "** Build and install pycuda-2019.1.2" +CPU_CORES=$(nproc) +echo "** cpu cores available: " $CPU_CORES +tar xzvf pycuda-2019.1.2.tar.gz +cd pycuda-2019.1.2 +python3 ./configure.py --python-exe=/usr/bin/python3 --cuda-root=/usr/local/cuda --cudadrv-lib-dir=/usr/lib/${arch}-linux-gnu --boost-inc-dir=/usr/include --boost-lib-dir=/usr/lib/${arch}-linux-gnu --boost-python-libname=${boost_pyname} --boost-thread-libname=boost_thread --no-use-shipped-boost +make -j$CPU_CORES +python3 setup.py build +sudo python3 setup.py install + +popd + +python3 -c "import pycuda; print('pycuda version:', pycuda.VERSION)" diff --git a/modnet/onnx_to_tensorrt.py b/modnet/onnx_to_tensorrt.py new file mode 100644 index 0000000..23f0e8f --- /dev/null +++ b/modnet/onnx_to_tensorrt.py @@ -0,0 +1,117 @@ +"""onnx_to_tensorrt.py + +For converting a MODNet ONNX model to a TensorRT engine. 
+""" + + +import os +import argparse + +import tensorrt as trt + +if trt.__version__[0] < '7': + raise SystemExit('TensorRT version < 7') + + +BATCH_SIZE = 1 + + +def parse_args(): + """Parse command-line options and arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + '-v', '--verbose', action='store_true', + help='enable verbose output (for debugging) [False]') + parser.add_argument( + '--int8', action='store_true', + help='build INT8 TensorRT engine [False]') + parser.add_argument( + '--dla_core', type=int, default=-1, + help='id of DLA core for inference, ranging from 0 to N-1 [-1]') + parser.add_argument( + '--width', type=int, default=640, + help='input image width of the model [640]') + parser.add_argument( + '--height', type=int, default=480, + help='input image height of the model [480]') + parser.add_argument( + 'input_onnx', type=str, help='the input onnx file') + parser.add_argument( + 'output_engine', type=str, help='the output TensorRT engine file') + args = parser.parse_args() + return args + + +def load_onnx(onnx_file_path): + """Read the ONNX file.""" + with open(onnx_file_path, 'rb') as f: + return f.read() + + +def set_net_batch(network, batch_size): + """Set network input batch size.""" + shape = list(network.get_input(0).shape) + shape[0] = batch_size + network.get_input(0).shape = shape + return network + + +def build_engine(onnx_file_path, width, height, + do_int8=False, dla_core=False, verbose=False): + """Build a TensorRT engine from ONNX using the older API.""" + onnx_data = load_onnx(onnx_file_path) + + TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger() + EXPLICIT_BATCH = [1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)] + with trt.Builder(TRT_LOGGER) as builder, builder.create_network(*EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser: + if do_int8 and not builder.platform_has_fast_int8: + raise RuntimeError('INT8 not supported on this platform') + if not parser.parse(onnx_data): + print('ERROR: Failed to parse the ONNX file.') + for error in range(parser.num_errors): + print(parser.get_error(error)) + return None + network = set_net_batch(network, BATCH_SIZE) + + builder.max_batch_size = BATCH_SIZE + config = builder.create_builder_config() + config.max_workspace_size = 1 << 30 + config.set_flag(trt.BuilderFlag.GPU_FALLBACK) + config.set_flag(trt.BuilderFlag.FP16) + profile = builder.create_optimization_profile() + profile.set_shape( + 'Input', # input tensor name + (BATCH_SIZE, 3, height, width), # min shape + (BATCH_SIZE, 3, height, width), # opt shape + (BATCH_SIZE, 3, height, width)) # max shape + config.add_optimization_profile(profile) + if do_int8: + raise RuntimeError('INT8 not implemented yet') + if dla_core >= 0: + raise RuntimeError('DLA_core not implemented yet') + engine = builder.build_engine(network, config) + + return engine + + +def main(): + args = parse_args() + if not os.path.isfile(args.input_onnx): + raise FileNotFoundError(args.input_onnx) + + print('Building an engine. 
This would take a while...') + print('(Use "-v" or "--verbose" to enable verbose logging.)') + engine = build_engine( + args.input_onnx, args.width, args.height, + args.int8, args.dla_core, args.verbose) + if engine is None: + raise SystemExit('ERROR: failed to build the TensorRT engine!') + print('Completed creating engine.') + + with open(args.output_engine, 'wb') as f: + f.write(engine.serialize()) + print('Serialized the TensorRT engine to file: %s' % args.output_engine) + + +if __name__ == '__main__': + main() diff --git a/modnet/test_onnx.py b/modnet/test_onnx.py new file mode 100644 index 0000000..a03ff67 --- /dev/null +++ b/modnet/test_onnx.py @@ -0,0 +1,29 @@ +"""run_onnx.py + +A simple script for verifying the modnet.onnx model. + +I used the following image for testing: +$ gdown --id 1fkyh03NEuSwvjFttYVwV7TjnJML04Xn6 -O image.jpg +""" + + +import numpy as np +import cv2 +import onnx +import onnxruntime + +img = cv2.imread('image.jpg') +img = cv2.resize(img, (512, 288), cv2.INTER_AREA) +img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) +img = img.transpose((2, 0, 1)).astype(np.float32) +img = (img - 127.5) / 127.5 +img = np.expand_dims(img, axis=0) + +session = onnxruntime.InferenceSession('modnet.onnx', None) +input_name = session.get_inputs()[0].name +output_name = session.get_outputs()[0].name +result = session.run([output_name], {input_name: img}) +matte = np.squeeze(result[0]) +cv2.imshow('Matte', (matte * 255.).astype(np.uint8)) +cv2.waitKey(0) +cv2.destroyAllWindows() diff --git a/modnet/torch2onnx/__init__.py b/modnet/torch2onnx/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modnet/torch2onnx/backbone.py b/modnet/torch2onnx/backbone.py new file mode 100644 index 0000000..98f5cdc --- /dev/null +++ b/modnet/torch2onnx/backbone.py @@ -0,0 +1,87 @@ +"""backbone.py + +This is a copy of: +https://github.com/ZHKKKe/MODNet/blob/master/src/models/backbones/wrapper.py +""" + + +import os +from functools import reduce + +import torch +import torch.nn as nn + +from .mobilenetv2 import MobileNetV2 + + +class BaseBackbone(nn.Module): + """Superclass of Replaceable Backbone Model for Semantic Estimation""" + + def __init__(self, in_channels): + super(BaseBackbone, self).__init__() + self.in_channels = in_channels + + self.model = None + self.enc_channels = [] + + def forward(self, x): + raise NotImplementedError + + def load_pretrained_ckpt(self): + raise NotImplementedError + + +class MobileNetV2Backbone(BaseBackbone): + """MobileNetV2 Backbone""" + + def __init__(self, in_channels): + super(MobileNetV2Backbone, self).__init__(in_channels) + + self.model = MobileNetV2(self.in_channels, alpha=1.0, expansion=6, num_classes=None) + self.enc_channels = [16, 24, 32, 96, 1280] + + def forward(self, x): + # x = reduce(lambda x, n: self.model.features[n](x), list(range(0, 2)), x) + x = self.model.features[0](x) + x = self.model.features[1](x) + enc2x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(2, 4)), x) + x = self.model.features[2](x) + x = self.model.features[3](x) + enc4x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(4, 7)), x) + x = self.model.features[4](x) + x = self.model.features[5](x) + x = self.model.features[6](x) + enc8x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(7, 14)), x) + x = self.model.features[7](x) + x = self.model.features[8](x) + x = self.model.features[9](x) + x = self.model.features[10](x) + x = self.model.features[11](x) + x = self.model.features[12](x) + x = 
self.model.features[13](x)
+        enc16x = x
+
+        # x = reduce(lambda x, n: self.model.features[n](x), list(range(14, 19)), x)
+        x = self.model.features[14](x)
+        x = self.model.features[15](x)
+        x = self.model.features[16](x)
+        x = self.model.features[17](x)
+        x = self.model.features[18](x)
+        enc32x = x
+        return [enc2x, enc4x, enc8x, enc16x, enc32x]
+
+    def load_pretrained_ckpt(self):
+        # the pre-trained model is provided by https://github.com/thuyngch/Human-Segmentation-PyTorch
+        ckpt_path = './pretrained/mobilenetv2_human_seg.ckpt'
+        if not os.path.exists(ckpt_path):
+            print('cannot find the pretrained mobilenetv2 backbone')
+            exit()
+
+        ckpt = torch.load(ckpt_path)
+        self.model.load_state_dict(ckpt)
diff --git a/modnet/torch2onnx/export.py b/modnet/torch2onnx/export.py
new file mode 100644
index 0000000..5cbf9cc
--- /dev/null
+++ b/modnet/torch2onnx/export.py
@@ -0,0 +1,63 @@
+"""export.py
+
+This script is an adapted copy of:
+https://github.com/ZHKKKe/MODNet/blob/master/onnx/export_onnx.py
+
+This script is for converting a PyTorch MODNet model to ONNX. The
+output ONNX model will have a fixed batch size (1) and fixed input
+image width/height. The input image width and height could be
+specified by command-line options (default to 512x288).
+
+Example usage: (Recommended to run this inside a virtual environment)
+$ python export.py --width 512 --height 288 \
+    modnet_photographic_portrait_matting.ckpt \
+    modnet.onnx
+"""
+
+
+import os
+import argparse
+
+import torch
+from torch.autograd import Variable
+
+from .modnet import MODNet
+
+
+BATCH_SIZE = 1
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--width', type=int, default=512,
+        help='image width of the converted ONNX model [512]')
+    parser.add_argument(
+        '--height', type=int, default=288,
+        help='image height of the converted ONNX model [288]')
+    parser.add_argument(
+        '-v', '--verbose', action='store_true',
+        help='enable verbose logging [False]')
+    parser.add_argument(
+        'input_ckpt', type=str, help='the input PyTorch checkpoint file path')
+    parser.add_argument(
+        'output_onnx', type=str, help='the output ONNX file path')
+    args = parser.parse_args()
+
+    if not os.path.isfile(args.input_ckpt):
+        raise SystemExit('ERROR: file (%s) not found!' % args.input_ckpt)
+
+    # define model & load checkpoint
+    modnet = torch.nn.DataParallel(MODNet()).cuda()
+    modnet.load_state_dict(torch.load(args.input_ckpt))
+    modnet.eval()
+
+    # prepare dummy input
+    dummy_img = torch.rand(BATCH_SIZE, 3, args.height, args.width) * 2. - 1.
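+    # Note: torch.rand() is uniform in [0, 1), so the line above maps the dummy
+    # input into the [-1, 1] range (the same normalization test_onnx.py applies
+    # to real images).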
+ dummy_img = dummy_img.cuda() + + # export to onnx model + torch.onnx.export( + modnet.module, dummy_img, args.output_onnx, + opset_version=11, export_params=True, verbose=args.verbose, + input_names=['input'], output_names=['output']) diff --git a/modnet/torch2onnx/mobilenetv2.py b/modnet/torch2onnx/mobilenetv2.py new file mode 100644 index 0000000..997ab88 --- /dev/null +++ b/modnet/torch2onnx/mobilenetv2.py @@ -0,0 +1,204 @@ +"""mobilenetv2.py + +This is a copy of: +https://github.com/ZHKKKe/MODNet/blob/master/src/models/backbones/mobilenetv2.py +""" + + +import math +import json +from functools import reduce + +import torch +from torch import nn + + +#------------------------------------------------------------------------------ +# Useful functions +#------------------------------------------------------------------------------ + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(inplace=True) + ) + + +def conv_1x1_bn(inp, oup): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(inplace=True) + ) + + +#------------------------------------------------------------------------------ +# Class of Inverted Residual block +#------------------------------------------------------------------------------ + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expansion, dilation=1): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = round(inp * expansion) + self.use_res_connect = self.stride == 1 and inp == oup + + if expansion == 1: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +#------------------------------------------------------------------------------ +# Class of MobileNetV2 +#------------------------------------------------------------------------------ + +class MobileNetV2(nn.Module): + def __init__(self, in_channels, alpha=1.0, expansion=6, num_classes=1000): + super(MobileNetV2, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + input_channel = 32 + last_channel = 1280 + interverted_residual_setting = [ + # t, c, n, s + [1 , 16, 1, 1], + [expansion, 24, 2, 2], + [expansion, 32, 3, 2], + [expansion, 64, 4, 2], + [expansion, 96, 3, 1], + [expansion, 160, 3, 2], + [expansion, 320, 1, 1], + ] + + # building first layer + input_channel = _make_divisible(input_channel*alpha, 8) + self.last_channel = 
_make_divisible(last_channel*alpha, 8) if alpha > 1.0 else last_channel + self.features = [conv_bn(self.in_channels, input_channel, 2)] + + # building inverted residual blocks + for t, c, n, s in interverted_residual_setting: + output_channel = _make_divisible(int(c*alpha), 8) + for i in range(n): + if i == 0: + self.features.append(InvertedResidual(input_channel, output_channel, s, expansion=t)) + else: + self.features.append(InvertedResidual(input_channel, output_channel, 1, expansion=t)) + input_channel = output_channel + + # building last several layers + self.features.append(conv_1x1_bn(input_channel, self.last_channel)) + + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # building classifier + if self.num_classes is not None: + self.classifier = nn.Sequential( + nn.Dropout(0.2), + nn.Linear(self.last_channel, num_classes), + ) + + # Initialize weights + self._init_weights() + + def forward(self, x): + # Stage1 + x = self.features[0](x) + x = self.features[1](x) + # Stage2 + x = self.features[2](x) + x = self.features[3](x) + # Stage3 + x = self.features[4](x) + x = self.features[5](x) + x = self.features[6](x) + # Stage4 + x = self.features[7](x) + x = self.features[8](x) + x = self.features[9](x) + x = self.features[10](x) + x = self.features[11](x) + x = self.features[12](x) + x = self.features[13](x) + # Stage5 + x = self.features[14](x) + x = self.features[15](x) + x = self.features[16](x) + x = self.features[17](x) + x = self.features[18](x) + + # Classification + if self.num_classes is not None: + x = x.mean(dim=(2,3)) + x = self.classifier(x) + + # Output + return x + + def _load_pretrained_model(self, pretrained_file): + pretrain_dict = torch.load(pretrained_file, map_location='cpu') + model_dict = {} + state_dict = self.state_dict() + print("[MobileNetV2] Loading pretrained model...") + for k, v in pretrain_dict.items(): + if k in state_dict: + model_dict[k] = v + else: + print(k, "is ignored") + state_dict.update(model_dict) + self.load_state_dict(state_dict) + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/modnet/torch2onnx/modnet.py b/modnet/torch2onnx/modnet.py new file mode 100644 index 0000000..f94876a --- /dev/null +++ b/modnet/torch2onnx/modnet.py @@ -0,0 +1,248 @@ +"""modnet.py + +This is a modified version of: +https://github.com/ZHKKKe/MODNet/blob/master/onnx/modnet_onnx.py + +* "scale_factor" replaced by "size" in all F.interpolate() +* SEBlock takes only 1 "channels" argument +""" + + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .backbone import MobileNetV2Backbone + + +SUPPORTED_BACKBONES = {'mobilenetv2': MobileNetV2Backbone} + + +#------------------------------------------------------------------------------ +# MODNet Basic Modules +#------------------------------------------------------------------------------ + +class IBNorm(nn.Module): + """Combine Instance Norm and Batch Norm into One Layer""" + + def __init__(self, in_channels): + super(IBNorm, self).__init__() + assert in_channels % 2 == 0 + self.bnorm_channels = in_channels // 2 + self.inorm_channels = in_channels - self.bnorm_channels + + self.bnorm = nn.BatchNorm2d(self.bnorm_channels, affine=True) + self.inorm = nn.InstanceNorm2d(self.inorm_channels, affine=False) + + def forward(self, x): + bn_x = self.bnorm(x[:, :self.bnorm_channels, ...].contiguous()) + in_x = self.inorm(x[:, self.bnorm_channels:, ...].contiguous()) + + return torch.cat((bn_x, in_x), 1) + + +class Conv2dIBNormRelu(nn.Module): + """Convolution + IBNorm + ReLu""" + + def __init__(self, in_channels, out_channels, kernel_size, + stride=1, padding=0, dilation=1, groups=1, bias=True, + with_ibn=True, with_relu=True): + super(Conv2dIBNormRelu, self).__init__() + + layers = [ + nn.Conv2d(in_channels, out_channels, kernel_size, + stride=stride, padding=padding, dilation=dilation, + groups=groups, bias=bias) + ] + + if with_ibn: + layers.append(IBNorm(out_channels)) + if with_relu: + layers.append(nn.ReLU(inplace=True)) + + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + + +class SEBlock(nn.Module): + """SE Block as proposed in https://arxiv.org/pdf/1709.01507.pdf""" + + def __init__(self, channels, reduction=1): + super(SEBlock, self).__init__() + self.channels = channels + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channels, channels // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channels // reduction, channels, bias=False), + nn.Sigmoid() + ) + + def forward(self, x): + b = x.size()[0] + w = self.pool(x).view(b, self.channels) + w = self.fc(w).view(b, self.channels, 1, 1) + return x * w + + +#------------------------------------------------------------------------------ +# MODNet Branches +#------------------------------------------------------------------------------ + +class LRBranch(nn.Module): + """Low Resolution Branch of MODNet""" + + def __init__(self, backbone): + super(LRBranch, self).__init__() + + enc_channels = backbone.enc_channels + + self.backbone = backbone + self.se_block = SEBlock(enc_channels[4], reduction=4) + self.conv_lr16x = Conv2dIBNormRelu(enc_channels[4], enc_channels[3], 5, stride=1, padding=2) + self.conv_lr8x = Conv2dIBNormRelu(enc_channels[3], enc_channels[2], 5, stride=1, padding=2) + self.conv_lr = Conv2dIBNormRelu(enc_channels[2], 1, 
kernel_size=3, stride=2, padding=1, with_ibn=False, with_relu=False) + + def forward(self, img): + enc_features = self.backbone.forward(img) + enc2x, enc4x, enc32x = enc_features[0], enc_features[1], enc_features[4] + + enc32x = self.se_block(enc32x) + h, w = enc32x.size()[2:] # replacing "scale_factor" + lr16x = F.interpolate(enc32x, size=(h*2, w*2), mode='bilinear', align_corners=False) + lr16x = self.conv_lr16x(lr16x) + h, w = lr16x.size()[2:] # replacing "scale_factor" + lr8x = F.interpolate(lr16x, size=(h*2, w*2), mode='bilinear', align_corners=False) + lr8x = self.conv_lr8x(lr8x) + + return lr8x, [enc2x, enc4x] + + +class HRBranch(nn.Module): + """High Resolution Branch of MODNet""" + + def __init__(self, hr_channels, enc_channels): + super(HRBranch, self).__init__() + + self.tohr_enc2x = Conv2dIBNormRelu(enc_channels[0], hr_channels, 1, stride=1, padding=0) + self.conv_enc2x = Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=2, padding=1) + + self.tohr_enc4x = Conv2dIBNormRelu(enc_channels[1], hr_channels, 1, stride=1, padding=0) + self.conv_enc4x = Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1) + + self.conv_hr4x = nn.Sequential( + Conv2dIBNormRelu(3 * hr_channels + 3, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1), + ) + + self.conv_hr2x = nn.Sequential( + Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1), + ) + + self.conv_hr = nn.Sequential( + Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(hr_channels, 1, kernel_size=1, stride=1, padding=0, with_ibn=False, with_relu=False), + ) + + def forward(self, img, enc2x, enc4x, lr8x): + h, w = img.size()[2:] # replacing "scale_factor" + assert h % 4 == 0 and w % 4 == 0 + img2x = F.interpolate(img, size=(h//2, w//2), mode='bilinear', align_corners=False) + img4x = F.interpolate(img, size=(h//4, w//4), mode='bilinear', align_corners=False) + + enc2x = self.tohr_enc2x(enc2x) + hr4x = self.conv_enc2x(torch.cat((img2x, enc2x), dim=1)) + + enc4x = self.tohr_enc4x(enc4x) + hr4x = self.conv_enc4x(torch.cat((hr4x, enc4x), dim=1)) + + h, w = lr8x.size()[2:] # replacing "scale_factor" + lr4x = F.interpolate(lr8x, size=(h*2, w*2), mode='bilinear', align_corners=False) + hr4x = self.conv_hr4x(torch.cat((hr4x, lr4x, img4x), dim=1)) + + h, w = hr4x.size()[2:] # replacing "scale_factor" + hr2x = F.interpolate(hr4x, size=(h*2, w*2), mode='bilinear', align_corners=False) + hr2x = self.conv_hr2x(torch.cat((hr2x, enc2x), dim=1)) + + return hr2x + + +class FusionBranch(nn.Module): + """Fusion Branch of MODNet""" + + def __init__(self, hr_channels, enc_channels): + super(FusionBranch, self).__init__() + self.conv_lr4x = Conv2dIBNormRelu(enc_channels[2], hr_channels, 5, stride=1, padding=2) + + self.conv_f2x = Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1) + self.conv_f = nn.Sequential( + Conv2dIBNormRelu(hr_channels + 3, int(hr_channels / 2), 3, stride=1, padding=1), + Conv2dIBNormRelu(int(hr_channels / 2), 1, 1, stride=1, padding=0, with_ibn=False, with_relu=False), + ) + + def forward(self, img, lr8x, hr2x): + h, w = lr8x.size()[2:] # replacing "scale_factor" + lr4x = 
F.interpolate(lr8x, size=(h*2, w*2), mode='bilinear', align_corners=False) + lr4x = self.conv_lr4x(lr4x) + h, w = lr4x.size()[2:] # replacing "scale_factor" + lr2x = F.interpolate(lr4x, size=(h*2, w*2), mode='bilinear', align_corners=False) + + f2x = self.conv_f2x(torch.cat((lr2x, hr2x), dim=1)) + h, w = f2x.size()[2:] # replacing "scale_factor" + f = F.interpolate(f2x, size=(h*2, w*2), mode='bilinear', align_corners=False) + f = self.conv_f(torch.cat((f, img), dim=1)) + pred_matte = torch.sigmoid(f) + + return pred_matte + + +#------------------------------------------------------------------------------ +# MODNet +#------------------------------------------------------------------------------ + +class MODNet(nn.Module): + """Architecture of MODNet""" + + def __init__(self, in_channels=3, hr_channels=32, backbone_arch='mobilenetv2', backbone_pretrained=False): + super(MODNet, self).__init__() + + self.in_channels = in_channels + self.hr_channels = hr_channels + self.backbone_arch = backbone_arch + + self.backbone = SUPPORTED_BACKBONES[self.backbone_arch](self.in_channels) + self.lr_branch = LRBranch(self.backbone) + self.hr_branch = HRBranch(self.hr_channels, self.backbone.enc_channels) + self.f_branch = FusionBranch(self.hr_channels, self.backbone.enc_channels) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + self._init_conv(m) + elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.InstanceNorm2d): + self._init_norm(m) + + if backbone_pretrained: + self.backbone.load_pretrained_ckpt() + + def forward(self, img): + lr8x, [enc2x, enc4x] = self.lr_branch(img) + hr2x = self.hr_branch(img, enc2x, enc4x, lr8x) + pred_matte = self.f_branch(img, lr8x, hr2x) + return pred_matte + + def _init_conv(self, conv): + nn.init.kaiming_uniform_( + conv.weight, a=0, mode='fan_in', nonlinearity='relu') + if conv.bias is not None: + nn.init.constant_(conv.bias, 0) + + def _init_norm(self, norm): + if norm.weight is not None: + nn.init.constant_(norm.weight, 1) + nn.init.constant_(norm.bias, 0) diff --git a/modnet/torch2onnx/requirements.txt b/modnet/torch2onnx/requirements.txt new file mode 100644 index 0000000..0e715aa --- /dev/null +++ b/modnet/torch2onnx/requirements.txt @@ -0,0 +1,8 @@ +Cython +numpy +scikit-build +opencv-python +PyImage +onnx==1.8.1 +onnxruntime==1.6.0 +torch==1.7.1 diff --git a/mtcnn/Makefile b/mtcnn/Makefile new file mode 100644 index 0000000..c763264 --- /dev/null +++ b/mtcnn/Makefile @@ -0,0 +1,6 @@ +OUTNAME_RELEASE = create_engines +OUTNAME_DEBUG = create_engines_debug +MAKEFILE_CONFIG ?= ../common/Makefile.config +include $(MAKEFILE_CONFIG) + +all: release diff --git a/mtcnn/README.md b/mtcnn/README.md new file mode 100644 index 0000000..6ff324b --- /dev/null +++ b/mtcnn/README.md @@ -0,0 +1,8 @@ +The MTCNN caffe model files are taken from [https://github.com/PKUZHOU/MTCNN_FaceDetection_TensorRT](https://github.com/PKUZHOU/MTCNN_FaceDetection_TensorRT). These model files contains a workaround which replaces 'PReLU' with 'ReLU', 'Scale' and 'Elementwise Addition' layers. I use them to get around the issue of TensorRT 3.x/4.x not supporting PReLU layers. Please refer to the original GitHub page (linked above) for more details. 
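The equivalence behind this workaround is easy to sanity-check in plain NumPy. The sketch below is illustrative only — it assumes the first `Scale` layer holds a constant -1 and the second holds the negated PReLU slope (-a), which is one way the original PReLU weights could have been folded; the actual folded values live in the converted `.caffemodel` files.

```python
# Illustration only (not part of the converter): PReLU(x) = ReLU(x) - a * ReLU(-x),
# i.e. a ReLU branch plus a Scale(-1) -> ReLU -> Scale(-a) branch, summed by an
# Eltwise SUM layer -- the pattern used in det*_relu.prototxt.
import numpy as np

def prelu(x, a):
    """Reference PReLU: x if x > 0, a * x otherwise."""
    return np.where(x > 0, x, a * x)

def prelu_via_relu_scale_eltwise(x, a):
    relu_branch = np.maximum(x, 0.0)        # "ReLU1"
    neg_branch = np.maximum(-1.0 * x, 0.0)  # "scale1_1" (* -1) then "ReLU1_2"
    neg_branch = -a * neg_branch            # "scale1_2" (* -a)
    return relu_branch + neg_branch         # "eltwise-sum1"

x = np.random.randn(1, 10, 8, 8).astype(np.float32)
assert np.allclose(prelu(x, 0.25), prelu_via_relu_scale_eltwise(x, 0.25), atol=1e-6)
```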
+ +* det1_relu.prototxt +* det1_relu.caffemodel +* det2_relu.prototxt +* det2_relu.caffemodel +* det3_relu.prototxt +* det3_relu.caffemodel diff --git a/mtcnn/create_engines.cpp b/mtcnn/create_engines.cpp new file mode 100644 index 0000000..8adbed2 --- /dev/null +++ b/mtcnn/create_engines.cpp @@ -0,0 +1,251 @@ +// create_engines.cpp +// +// This program creates TensorRT engines for MTCNN models. +// +// Inputs: +// det1.prototxt +// det1.caffemodel +// det2.prototxt +// det2.caffemodel +// det3.prototxt +// det3.caffemodel +// +// Outputs: +// det1.engine +// det2.engine +// det3.engine + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" +#include "common.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +//static Logger gLogger(ILogger::Severity::kINFO); +static Logger gLogger(ILogger::Severity::kWARNING); + +class IHostMemoryFromFile : public IHostMemory +{ + public: + IHostMemoryFromFile(std::string filename); +#if NV_TENSORRT_MAJOR >= 6 + void* data() const noexcept { return mem; } + std::size_t size() const noexcept { return s; } + DataType type () const noexcept { return DataType::kFLOAT; } // not used + void destroy() noexcept { free(mem); } +#else // NV_TENSORRT_MAJOR < 6 + void* data() const { return mem; } + std::size_t size() const { return s; } + DataType type () const { return DataType::kFLOAT; } // not used + void destroy() { free(mem); } +#endif // NV_TENSORRT_MAJOR + private: + void *mem{nullptr}; + std::size_t s; +}; + +IHostMemoryFromFile::IHostMemoryFromFile(std::string filename) +{ + std::ifstream infile(filename, std::ifstream::binary | std::ifstream::ate); + s = infile.tellg(); + infile.seekg(0, std::ios::beg); + mem = malloc(s); + infile.read(reinterpret_cast(mem), s); +} + +std::string locateFile(const std::string& input) +{ + std::vector dirs{"./"}; + return locateFile(input, dirs); +} + +void caffeToTRTModel(const std::string& deployFile, // name for caffe prototxt + const std::string& modelFile, // name for model + const std::vector& outputs, // network outputs + unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with) + IHostMemory *&trtModelStream) +{ + // create API root class - must span the lifetime of the engine usage + IBuilder* builder = createInferBuilder(gLogger); +#if NV_TENSORRT_MAJOR >= 7 + INetworkDefinition* network = builder->createNetworkV2(0); // no kEXPLICIT_BATCH +#else // NV_TENSORRT_MAJOR < 7 + INetworkDefinition* network = builder->createNetwork(); +#endif + + // parse the caffe model to populate the network, then set the outputs + ICaffeParser* parser = createCaffeParser(); + + bool useFp16 = builder->platformHasFastFp16(); + + // create a 16-bit model if it's natively supported + DataType modelDataType = useFp16 ? 
DataType::kHALF : DataType::kFLOAT; + const IBlobNameToTensor *blobNameToTensor = + parser->parse(locateFile(deployFile).c_str(), // caffe deploy file + locateFile(modelFile).c_str(), // caffe model file + *network, // network definition that the parser will populate + modelDataType); + assert(blobNameToTensor != nullptr); + + // the caffe file has no notion of outputs, so we need to manually say which tensors the engine should generate + for (auto& s : outputs) + network->markOutput(*blobNameToTensor->find(s.c_str())); + +#if NV_TENSORRT_MAJOR >= 7 + auto config = builder->createBuilderConfig(); + assert(config != nullptr); + + builder->setMaxBatchSize(maxBatchSize); + config->setMaxWorkspaceSize(64_MB); + if (useFp16) { + config->setFlag(BuilderFlag::kFP16); + cout << "Building TensorRT engine in FP16 mode..." << endl; + } else { + cout << "Building TensorRT engine in FP32 mode..." << endl; + } + ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + config->destroy(); +#else // NV_TENSORRT_MAJOR < 7 + // Build the engine + builder->setMaxBatchSize(maxBatchSize); + builder->setMaxWorkspaceSize(64_MB); + + // set up the network for paired-fp16 format if available + if (useFp16) { +#if NV_TENSORRT_MAJOR >= 4 + builder->setFp16Mode(true); +#else // NV_TENSORRT_MAJOR < 4 + builder->setHalf2Mode(true); +#endif + } + ICudaEngine* engine = builder->buildCudaEngine(*network); +#endif // NV_TENSORRT_MAJOR >= 7 + assert(engine != nullptr); + + // we don't need the network any more, and we can destroy the parser + parser->destroy(); + network->destroy(); + + // serialize the engine, then close everything down + trtModelStream = engine->serialize(); + engine->destroy(); + builder->destroy(); +} + +void giestream_to_file(IHostMemory *trtModelStream, const std::string filename) +{ + assert(trtModelStream != nullptr); + std::ofstream outfile(filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile.write(reinterpret_cast(trtModelStream->data()), trtModelStream->size()); + outfile.close(); +} + +void file_to_giestream(const std::string filename, IHostMemoryFromFile *&trtModelStream) +{ + trtModelStream = new IHostMemoryFromFile(filename); +} + +void verify_engine(std::string det_name, int num_bindings) +{ + std::stringstream ss; + ss << det_name << ".engine"; + IHostMemoryFromFile *trtModelStream{nullptr}; + file_to_giestream(ss.str(), trtModelStream); + + // create an engine + IRuntime* infer = createInferRuntime(gLogger); + assert(infer != nullptr); + ICudaEngine* engine = infer->deserializeCudaEngine( + trtModelStream->data(), + trtModelStream->size(), + nullptr); + assert(engine != nullptr); + + assert(engine->getNbBindings() == num_bindings); + std::cout << "Bindings for " << det_name << " after deserializing:" + << std::endl; + for (int bi = 0; bi < num_bindings; bi++) { +#if NV_TENSORRT_MAJOR >= 4 + Dims3 dim = static_cast(engine->getBindingDimensions(bi)); + if (engine->bindingIsInput(bi) == true) { + std::cout << " Input "; + } else { + std::cout << " Output "; + } + std::cout << bi << ": " << engine->getBindingName(bi) << ", " + << dim.d[0] << "x" << dim.d[1] << "x" << dim.d[2] + << std::endl; +#else // NV_TENSORRT_MAJOR < 4 + DimsCHW dim = static_cast(engine->getBindingDimensions(bi)); + if (engine->bindingIsInput(bi) == true) { + std::cout << " Input "; + } else { + std::cout << " Output "; + } + std::cout << bi << ": " << engine->getBindingName(bi) << ", " + << dim.c() << "x" << dim.h() << "x" << dim.w() + << std::endl; +#endif // NV_TENSORRT_MAJOR + } + 
engine->destroy(); + infer->destroy(); + trtModelStream->destroy(); +} + +int main(int argc, char** argv) +{ + IHostMemory *trtModelStream{nullptr}; + + std::cout << "Building det1.engine (PNet), maxBatchSize = 1" + << std::endl; + caffeToTRTModel("det1_relu.prototxt", + "det1_relu.caffemodel", + std::vector { "prob1", "conv4-2" }, + 1, // max batch size + trtModelStream); + giestream_to_file(trtModelStream, "det1.engine"); + trtModelStream->destroy(); + + std::cout << "Building det2.engine (RNet), maxBatchSize = 256" + << std::endl; + caffeToTRTModel("det2_relu.prototxt", + "det2_relu.caffemodel", + std::vector { "prob1", "conv5-2" }, + 256, // max batch size + trtModelStream); + giestream_to_file(trtModelStream, "det2.engine"); + trtModelStream->destroy(); + + std::cout << "Building det3.engine (ONet), maxBatchSize = 64" + << std::endl; + caffeToTRTModel("det3_relu.prototxt", + "det3_relu.caffemodel", + std::vector { "prob1", "conv6-2", "conv6-3" }, + 64, // max batch size + trtModelStream); + giestream_to_file(trtModelStream, "det3.engine"); + trtModelStream->destroy(); + //delete trtModelStream; + + shutdownProtobufLibrary(); + + std::cout << std::endl << "Verifying engines..." << std::endl; + verify_engine("det1", 3); + verify_engine("det2", 3); + verify_engine("det3", 4); + std::cout << "Done." << std::endl; + return 0; +} diff --git a/mtcnn/det1_relu.caffemodel b/mtcnn/det1_relu.caffemodel new file mode 100644 index 0000000..df1ec14 Binary files /dev/null and b/mtcnn/det1_relu.caffemodel differ diff --git a/mtcnn/det1_relu.prototxt b/mtcnn/det1_relu.prototxt new file mode 100644 index 0000000..2c82972 --- /dev/null +++ b/mtcnn/det1_relu.prototxt @@ -0,0 +1,290 @@ +name: "PNet" +layer +{ + name: "data" + type: "Input" + top: "data" + # + # Max allowed input image size as: 1280x720 + # 'minsize' = 40 + # + # Input dimension of the 1st 'scale': + # 720 * 12 / 40 = 216 + # 1280 * 12 / 40 = 384 + # + # H's in all scales: (scale factor = 0.709) + # Original: 216.0, 153.1, 108.6 77.0, 54.6, 38.7, 27.4, 19.5, 13.8, (9.8) + # Rounded: 216, 154, 108, 78, 54, 38, 28, 20, 14 + # Offsets: 0, 216, 370, 478, 556, 610, 648, 676, 696, (710) + # + # Input dimension of the 'stacked image': 710x384 + # + # Output dimension: (stride=2) + # (710 - 12) / 2 + 1 = 350 + # (384 - 12) / 2 + 1 = 187 + # + input_param{shape:{dim:1 dim:3 dim:710 dim:384}} +} + +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 10 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "ReLU1" + type: "ReLU" + bottom: "conv1" + top: "conv1_1" +} + +layer { + name: "scale1_1" + bottom: "conv1" + top: "conv1_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "scale1_2" + bottom: "conv1_2" + top: "conv1_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + + } +} +layer { + name: "eltwise-sum1" + type: "Eltwise" + bottom: "conv1_1" + bottom: "conv1_2" + top: "conv1_3" + eltwise_param { operation: SUM } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_3" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} + +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 
16 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "ReLU2" + type: "ReLU" + bottom: "conv2" + top: "conv2_1" +} + +layer { + name: "scale2_1" + bottom: "conv2" + top: "conv2_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + + } +} +layer { + name: "ReLU2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "scale2_2" + bottom: "conv2_2" + top: "conv2_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum2" + type: "Eltwise" + bottom: "conv2_1" + bottom: "conv2_2" + top: "conv2_3" + eltwise_param { operation: SUM } +} + + +layer { + name: "conv3" + type: "Convolution" + bottom: "conv2_3" + top: "conv3" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 32 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "ReLU3" + type: "ReLU" + bottom: "conv3" + top: "conv3_1" +} +layer { + name: "scale3_1" + bottom: "conv3" + top: "conv3_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "scale3_2" + bottom: "conv3_2" + top: "conv3_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum3" + type: "Eltwise" + bottom: "conv3_1" + bottom: "conv3_2" + top: "conv3_3" + eltwise_param { operation: SUM } +} + +layer { + name: "conv4-1" + type: "Convolution" + bottom: "conv3_3" + top: "conv4-1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 2 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} + +layer { + name: "conv4-2" + type: "Convolution" + bottom: "conv3_3" + top: "conv4-2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 4 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "prob1" + type: "Softmax" + bottom: "conv4-1" + top: "prob1" +} diff --git a/mtcnn/det2_relu.caffemodel b/mtcnn/det2_relu.caffemodel new file mode 100644 index 0000000..f62e6da Binary files /dev/null and b/mtcnn/det2_relu.caffemodel differ diff --git a/mtcnn/det2_relu.prototxt b/mtcnn/det2_relu.prototxt new file mode 100644 index 0000000..1992882 --- /dev/null +++ b/mtcnn/det2_relu.prototxt @@ -0,0 +1,370 @@ +name: "RNet" +layer +{ + name: "data" + type: "Input" + top: "data" + input_param{shape:{dim:1 dim:3 dim:24 dim:24}} +} + +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 28 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1" + top: "conv1_1" + propagate_down: true +} + +layer { + name: "scale1_1" + bottom: "conv1" + top: "conv1_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "scale1_2" + bottom: "conv1_2" + top: "conv1_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum1" + type: "Eltwise" + bottom: "conv1_1" + 
bottom: "conv1_2" + top: "conv1_3" + eltwise_param { operation: SUM } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_3" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} + +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 48 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2" + top: "conv2_1" + propagate_down: true +} + +layer { + name: "scale2_1" + bottom: "conv2" + top: "conv2_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "scale2_2" + bottom: "conv2_2" + top: "conv2_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum2" + type: "Eltwise" + bottom: "conv2_1" + bottom: "conv2_2" + top: "conv2_3" + eltwise_param { operation: SUM } +} + +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_3" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +#################################### + +################################## +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 2 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + name: "scale3_1" + bottom: "conv3" + top: "conv3_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "scale3_2" + bottom: "conv3_2" + top: "conv3_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3_1" + propagate_down: true +} +layer { + name: "eltwise-sum3" + type: "Eltwise" + bottom: "conv3_1" + bottom: "conv3_2" + top: "conv3_3" + eltwise_param { operation: SUM } +} + +############################### + +############################### + +layer { + name: "conv4" + type: "InnerProduct" + bottom: "conv3_3" + top: "conv4" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + inner_product_param { + num_output: 128 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4" + top: "conv4_1" +} + +layer { + name: "scale4_1" + bottom: "conv4" + top: "conv4_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "scale4_2" + bottom: "conv4_2" + top: "conv4_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum4" + type: "Eltwise" + bottom: "conv4_1" + bottom: "conv4_2" + top: "conv4_3" + eltwise_param { operation: SUM } +} + + +layer { + name: "conv5-1" + type: "InnerProduct" + bottom: "conv4_3" + top: "conv5-1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + inner_product_param { + num_output: 2 + #kernel_size: 1 + #stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: 
"constant" + value: 0 + } + } +} +layer { + name: "conv5-2" + type: "InnerProduct" + bottom: "conv4_3" + top: "conv5-2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + inner_product_param { + num_output: 4 + #kernel_size: 1 + #stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "prob1" + type: "Softmax" + bottom: "conv5-1" + top: "prob1" +} \ No newline at end of file diff --git a/mtcnn/det3_relu.caffemodel b/mtcnn/det3_relu.caffemodel new file mode 100644 index 0000000..15e8d70 Binary files /dev/null and b/mtcnn/det3_relu.caffemodel differ diff --git a/mtcnn/det3_relu.prototxt b/mtcnn/det3_relu.prototxt new file mode 100644 index 0000000..7db4f62 --- /dev/null +++ b/mtcnn/det3_relu.prototxt @@ -0,0 +1,457 @@ +name: "ONet" +input: "data" +input_dim: 1 +input_dim: 3 +input_dim: 48 +input_dim: 48 +################################## +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + convolution_param { + num_output: 32 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1" + top: "conv1_1" +} + +layer { + name: "scale1_1" + bottom: "conv1" + top: "conv1_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "scale1_2" + bottom: "conv1_2" + top: "conv1_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum1" + type: "Eltwise" + bottom: "conv1_1" + bottom: "conv1_2" + top: "conv1_3" + eltwise_param { operation: SUM } +} + +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_3" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + convolution_param { + num_output: 64 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2" + top: "conv2_1" +} +layer { + name: "scale2_1" + bottom: "conv2" + top: "conv2_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "scale2_2" + bottom: "conv2_2" + top: "conv2_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum2" + type: "Eltwise" + bottom: "conv2_1" + bottom: "conv2_2" + top: "conv2_3" + eltwise_param { operation: SUM } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_3" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} + +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + convolution_param { + num_output: 64 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3" + top: "conv3_1" +} +layer { + name: "scale3_1" + bottom: "conv3" + top: "conv3_2" + type: "Scale" + scale_param { + axis: 1 + 
bias_term:false + } +} +layer { + name: "ReLU3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "scale3_2" + bottom: "conv3_2" + top: "conv3_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum3" + type: "Eltwise" + bottom: "conv3_1" + bottom: "conv3_2" + top: "conv3_3" + eltwise_param { operation: SUM } +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "pool3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + convolution_param { + num_output: 128 + kernel_size: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4_1" +} + +layer { + name: "scale4_1" + bottom: "conv4" + top: "conv4_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "scale4_2" + bottom: "conv4_2" + top: "conv4_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum4" + type: "Eltwise" + bottom: "conv4_1" + bottom: "conv4_2" + top: "conv4_3" + eltwise_param { operation: SUM } +} + +layer { + name: "conv5" + type: "InnerProduct" + bottom: "conv4_3" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + inner_product_param { + #kernel_size: 3 + num_output: 256 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5" + top: "conv5_1" +} + +layer { + name: "scale5_1" + bottom: "conv5" + top: "conv5_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "ReLU5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "scale5_2" + bottom: "conv5_2" + top: "conv5_2" + type: "Scale" + scale_param { + axis: 1 + bias_term:false + } +} +layer { + name: "eltwise-sum5" + type: "Eltwise" + bottom: "conv5_1" + bottom: "conv5_2" + top: "conv5_3" + eltwise_param { operation: SUM } +} + +layer { + name: "conv6-1" + type: "InnerProduct" + bottom: "conv5_3" + top: "conv6-1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + inner_product_param { + #kernel_size: 1 + num_output: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6-2" + type: "InnerProduct" + bottom: "conv5_3" + top: "conv6-2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + inner_product_param { + #kernel_size: 1 + num_output: 4 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6-3" + type: "InnerProduct" + bottom: "conv5_3" + top: "conv6-3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 1 + } + inner_product_param { + #kernel_size: 1 + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "prob1" + type: "Softmax" + bottom: "conv6-1" + top: "prob1" +} \ No newline at end of file diff --git a/plugins/Makefile b/plugins/Makefile new file mode 100644 index 0000000..51af1cd --- /dev/null +++ b/plugins/Makefile @@ -0,0 +1,37 @@ 
+CC=g++ +LD=ld +CXXFLAGS=-Wall -std=c++11 -g -O + +NVCC=nvcc + +# space separated compute values ex: computes=70 75. If not present will fetch device's CC +computes= + +ifeq ($(computes), ) + computes= $(shell python gpu_cc.py) + $(info computes: $(computes)) +endif + +NVCCFLAGS= $(foreach compute, $(computes),-gencode arch=compute_$(compute),code=[sm_$(compute),compute_$(compute)]) +$(info NVCCFLAGS: $(NVCCFLAGS)) + +# These are the directories where I installed TensorRT on my x86_64 PC. +TENSORRT_INCS=-I"/usr/local/TensorRT-7.1.3.4/include" +TENSORRT_LIBS=-L"/usr/local/TensorRT-7.1.3.4/lib" + +# INCS and LIBS +INCS=-I"/usr/local/cuda/include" $(TENSORRT_INCS) -I"/usr/local/include" -I"plugin" +LIBS=-L"/usr/local/cuda/lib64" $(TENSORRT_LIBS) -L"/usr/local/lib" -Wl,--start-group -lnvinfer -lnvparsers -lnvinfer_plugin -lcudnn -lcublas -lnvToolsExt -lcudart -lrt -ldl -lpthread -Wl,--end-group + +.PHONY: all clean + +all: libyolo_layer.so + +clean: + rm -f *.so *.o + +libyolo_layer.so: yolo_layer.o + $(CC) -shared -o $@ $< $(LIBS) + +yolo_layer.o: yolo_layer.cu yolo_layer.h + $(NVCC) -ccbin $(CC) $(INCS) $(NVCCFLAGS) -Xcompiler -fPIC -c -o $@ $< diff --git a/plugins/README.md b/plugins/README.md new file mode 100644 index 0000000..075bb77 --- /dev/null +++ b/plugins/README.md @@ -0,0 +1 @@ +The "yolo_layer.h" and "yolo_layer.cu" were taken and modified from [wang-xinyu/tensorrtx/yolov4](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov4). The original code is under [MIT License](https://github.com/wang-xinyu/tensorrtx/blob/master/LICENSE). diff --git a/plugins/gpu_cc.py b/plugins/gpu_cc.py new file mode 100644 index 0000000..ba4f385 --- /dev/null +++ b/plugins/gpu_cc.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +''' +# ported from https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549 +''' + +import ctypes + +CUDA_SUCCESS = 0 + +def get_gpu_archs(): + libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll') + for libname in libnames: + try: + cuda = ctypes.CDLL(libname) + except OSError: + continue + else: + break + else: + return + + gpu_archs = set() + + n_gpus = ctypes.c_int() + cc_major = ctypes.c_int() + cc_minor = ctypes.c_int() + + result = ctypes.c_int() + device = ctypes.c_int() + error_str = ctypes.c_char_p() + + result = cuda.cuInit(0) + if result != CUDA_SUCCESS: + cuda.cuGetErrorString(result, ctypes.byref(error_str)) + # print('cuInit failed with error code %d: %s' % (result, error_str.value.decode())) + return [] + + result = cuda.cuDeviceGetCount(ctypes.byref(n_gpus)) + if result != CUDA_SUCCESS: + cuda.cuGetErrorString(result, ctypes.byref(error_str)) + # print('cuDeviceGetCount failed with error code %d: %s' % (result, error_str.value.decode())) + return [] + + for i in range(n_gpus.value): + if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS: + gpu_archs.add(str(cc_major.value) + str(cc_minor.value)) + + return list(gpu_archs) + +if __name__ == '__main__': + print(' '.join(get_gpu_archs())) diff --git a/plugins/yolo_layer.cu b/plugins/yolo_layer.cu new file mode 100644 index 0000000..e35e26c --- /dev/null +++ b/plugins/yolo_layer.cu @@ -0,0 +1,372 @@ +/* + * yolo_layer.cu + * + * This code was originally written by wang-xinyu under MIT license. + * I took it from: + * + * https://github.com/wang-xinyu/tensorrtx/tree/master/yolov4 + * + * and made necessary modifications. 
+ * + * - JK Jung + */ + +#include "yolo_layer.h" + +using namespace Yolo; + +namespace +{ +// Write values into buffer +template +void write(char*& buffer, const T& val) +{ + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); +} + +// Read values from buffer +template +void read(const char*& buffer, T& val) +{ + val = *reinterpret_cast(buffer); + buffer += sizeof(T); +} +} // namespace + +namespace nvinfer1 +{ + YoloLayerPlugin::YoloLayerPlugin(int yolo_width, int yolo_height, int num_anchors, float* anchors, int num_classes, int input_width, int input_height, float scale_x_y, int new_coords) + { + mYoloWidth = yolo_width; + mYoloHeight = yolo_height; + mNumAnchors = num_anchors; + memcpy(mAnchorsHost, anchors, num_anchors * 2 * sizeof(float)); + mNumClasses = num_classes; + mInputWidth = input_width; + mInputHeight = input_height; + mScaleXY = scale_x_y; + mNewCoords = new_coords; + + CHECK(cudaMalloc(&mAnchors, MAX_ANCHORS * 2 * sizeof(float))); + CHECK(cudaMemcpy(mAnchors, mAnchorsHost, mNumAnchors * 2 * sizeof(float), cudaMemcpyHostToDevice)); + } + + YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) + { + const char *d = reinterpret_cast(data), *a = d; + read(d, mThreadCount); + read(d, mYoloWidth); + read(d, mYoloHeight); + read(d, mNumAnchors); + memcpy(mAnchorsHost, d, MAX_ANCHORS * 2 * sizeof(float)); + d += MAX_ANCHORS * 2 * sizeof(float); + read(d, mNumClasses); + read(d, mInputWidth); + read(d, mInputHeight); + read(d, mScaleXY); + read(d, mNewCoords); + + CHECK(cudaMalloc(&mAnchors, MAX_ANCHORS * 2 * sizeof(float))); + CHECK(cudaMemcpy(mAnchors, mAnchorsHost, mNumAnchors * 2 * sizeof(float), cudaMemcpyHostToDevice)); + + assert(d == a + length); + } + + IPluginV2IOExt* YoloLayerPlugin::clone() const NOEXCEPT + { + YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords); + p->setPluginNamespace(mPluginNamespace); + return p; + } + + void YoloLayerPlugin::terminate() NOEXCEPT + { + CHECK(cudaFree(mAnchors)); + } + + size_t YoloLayerPlugin::getSerializationSize() const NOEXCEPT + { + return sizeof(mThreadCount) + \ + sizeof(mYoloWidth) + sizeof(mYoloHeight) + \ + sizeof(mNumAnchors) + MAX_ANCHORS * 2 * sizeof(float) + \ + sizeof(mNumClasses) + \ + sizeof(mInputWidth) + sizeof(mInputHeight) + \ + sizeof(mScaleXY) + sizeof(mNewCoords); + } + + void YoloLayerPlugin::serialize(void* buffer) const NOEXCEPT + { + char* d = static_cast(buffer), *a = d; + write(d, mThreadCount); + write(d, mYoloWidth); + write(d, mYoloHeight); + write(d, mNumAnchors); + memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float)); + d += MAX_ANCHORS * 2 * sizeof(float); + write(d, mNumClasses); + write(d, mInputWidth); + write(d, mInputHeight); + write(d, mScaleXY); + write(d, mNewCoords); + + assert(d == a + getSerializationSize()); + } + + Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT + { + assert(index == 0); + assert(nbInputDims == 1); + assert(inputs[0].d[0] == (mNumClasses + 5) * mNumAnchors); + assert(inputs[0].d[1] == mYoloHeight); + assert(inputs[0].d[2] == mYoloWidth); + // output detection results to the channel dimension + int totalsize = mYoloWidth * mYoloHeight * mNumAnchors * sizeof(Detection) / sizeof(float); + return Dims3(totalsize, 1, 1); + } + + inline __device__ float sigmoidGPU(float x) { return 1.0f / (1.0f + __expf(-x)); } + + inline __device__ float scale_sigmoidGPU(float x, float s) + { + return s * 
sigmoidGPU(x) - (s - 1.0f) * 0.5f; + } + + // CalDetection(): This kernel processes 1 yolo layer calculation. It + // distributes calculations so that 1 GPU thread would be responsible + // for each grid/anchor combination. + // NOTE: The output (x, y, w, h) are between 0.0 and 1.0 + // (relative to orginal image width and height). + __global__ void CalDetection(const float *input, float *output, + int batch_size, + int yolo_width, int yolo_height, + int num_anchors, const float *anchors, + int num_classes, int input_w, int input_h, + float scale_x_y) + { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + Detection* det = ((Detection*) output) + idx; + int total_grids = yolo_width * yolo_height; + if (idx >= batch_size * total_grids * num_anchors) return; + + int info_len = 5 + num_classes; + //int batch_idx = idx / (total_grids * num_anchors); + int group_idx = idx / total_grids; + int anchor_idx = group_idx % num_anchors; + const float* cur_input = input + group_idx * (info_len * total_grids) + (idx % total_grids); + + int class_id; + float max_cls_logit = -CUDART_INF_F; // minus infinity + for (int i = 5; i < info_len; ++i) { + float l = *(cur_input + i * total_grids); + if (l > max_cls_logit) { + max_cls_logit = l; + class_id = i - 5; + } + } + float max_cls_prob = sigmoidGPU(max_cls_logit); + float box_prob = sigmoidGPU(*(cur_input + 4 * total_grids)); + //if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) + // return; + + int row = (idx % total_grids) / yolo_width; + int col = (idx % total_grids) % yolo_width; + + det->bbox[0] = (col + scale_sigmoidGPU(*(cur_input + 0 * total_grids), scale_x_y)) / yolo_width; // [0, 1] + det->bbox[1] = (row + scale_sigmoidGPU(*(cur_input + 1 * total_grids), scale_x_y)) / yolo_height; // [0, 1] + det->bbox[2] = __expf(*(cur_input + 2 * total_grids)) * *(anchors + 2 * anchor_idx + 0) / input_w; // [0, 1] + det->bbox[3] = __expf(*(cur_input + 3 * total_grids)) * *(anchors + 2 * anchor_idx + 1) / input_h; // [0, 1] + + det->bbox[0] -= det->bbox[2] / 2; // shift from center to top-left + det->bbox[1] -= det->bbox[3] / 2; + + det->det_confidence = box_prob; + det->class_id = class_id; + det->class_confidence = max_cls_prob; + } + + inline __device__ float scale(float x, float s) + { + return s * x - (s - 1.0f) * 0.5f; + } + + inline __device__ float square(float x) + { + return x * x; + } + + __global__ void CalDetection_NewCoords(const float *input, float *output, + int batch_size, + int yolo_width, int yolo_height, + int num_anchors, const float *anchors, + int num_classes, int input_w, int input_h, + float scale_x_y) + { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + Detection* det = ((Detection*) output) + idx; + int total_grids = yolo_width * yolo_height; + if (idx >= batch_size * total_grids * num_anchors) return; + + int info_len = 5 + num_classes; + //int batch_idx = idx / (total_grids * num_anchors); + int group_idx = idx / total_grids; + int anchor_idx = group_idx % num_anchors; + const float* cur_input = input + group_idx * (info_len * total_grids) + (idx % total_grids); + + int class_id; + float max_cls_prob = -CUDART_INF_F; // minus infinity + for (int i = 5; i < info_len; ++i) { + float l = *(cur_input + i * total_grids); + if (l > max_cls_prob) { + max_cls_prob = l; + class_id = i - 5; + } + } + float box_prob = *(cur_input + 4 * total_grids); + //if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) + // return; + + int row = (idx % total_grids) / yolo_width; + int col = (idx % total_grids) % yolo_width; + + 
det->bbox[0] = (col + scale(*(cur_input + 0 * total_grids), scale_x_y)) / yolo_width; // [0, 1] + det->bbox[1] = (row + scale(*(cur_input + 1 * total_grids), scale_x_y)) / yolo_height; // [0, 1] + det->bbox[2] = square(*(cur_input + 2 * total_grids)) * 4 * *(anchors + 2 * anchor_idx + 0) / input_w; // [0, 1] + det->bbox[3] = square(*(cur_input + 3 * total_grids)) * 4 * *(anchors + 2 * anchor_idx + 1) / input_h; // [0, 1] + + det->bbox[0] -= det->bbox[2] / 2; // shift from center to top-left + det->bbox[1] -= det->bbox[3] / 2; + + det->det_confidence = box_prob; + det->class_id = class_id; + det->class_confidence = max_cls_prob; + } + + void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize) + { + int num_elements = batchSize * mNumAnchors * mYoloWidth * mYoloHeight; + + //CHECK(cudaMemset(output, 0, num_elements * sizeof(Detection))); + + if (mNewCoords) { + CalDetection_NewCoords<<<(num_elements + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>> + (inputs[0], output, batchSize, mYoloWidth, mYoloHeight, mNumAnchors, (const float*) mAnchors, mNumClasses, mInputWidth, mInputHeight, mScaleXY); + } else { + CalDetection<<<(num_elements + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>> + (inputs[0], output, batchSize, mYoloWidth, mYoloHeight, mNumAnchors, (const float*) mAnchors, mNumClasses, mInputWidth, mInputHeight, mScaleXY); + } + } + +#if NV_TENSORRT_MAJOR >= 8 + int32_t YoloLayerPlugin::enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT +#else // NV_TENSORRT_MAJOR < 8 + int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) +#endif // NV_TENSORRT_MAJOR + { + forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize); + return 0; + } + + YoloPluginCreator::YoloPluginCreator() + { + mPluginAttributes.clear(); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* YoloPluginCreator::getPluginName() const NOEXCEPT + { + return "YoloLayer_TRT"; + } + + const char* YoloPluginCreator::getPluginVersion() const NOEXCEPT + { + return "1"; + } + + const PluginFieldCollection* YoloPluginCreator::getFieldNames() NOEXCEPT + { + return &mFC; + } + + IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT + { + assert(!strcmp(name, getPluginName())); + const PluginField* fields = fc->fields; + int yolo_width, yolo_height, num_anchors = 0; + float anchors[MAX_ANCHORS * 2]; + int num_classes, input_multiplier, new_coords = 0; + float scale_x_y = 1.0; + + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "yoloWidth")) + { + assert(fields[i].type == PluginFieldType::kINT32); + yolo_width = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "yoloHeight")) + { + assert(fields[i].type == PluginFieldType::kINT32); + yolo_height = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "numAnchors")) + { + assert(fields[i].type == PluginFieldType::kINT32); + num_anchors = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "numClasses")) + { + assert(fields[i].type == PluginFieldType::kINT32); + num_classes = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "inputMultiplier")) + { + assert(fields[i].type == PluginFieldType::kINT32); + input_multiplier = 
*(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "anchors")){ + assert(num_anchors > 0 && num_anchors <= MAX_ANCHORS); + assert(fields[i].type == PluginFieldType::kFLOAT32); + memcpy(anchors, static_cast(fields[i].data), num_anchors * 2 * sizeof(float)); + } + else if (!strcmp(attrName, "scaleXY")) + { + assert(fields[i].type == PluginFieldType::kFLOAT32); + scale_x_y = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "newCoords")) + { + assert(fields[i].type == PluginFieldType::kINT32); + new_coords = *(static_cast(fields[i].data)); + } + else + { + std::cerr << "Unknown attribute: " << attrName << std::endl; + assert(0); + } + } + assert(yolo_width > 0 && yolo_height > 0); + assert(anchors[0] > 0.0f && anchors[1] > 0.0f); + assert(num_classes > 0); + assert(input_multiplier == 64 || input_multiplier == 32 || \ + input_multiplier == 16 || input_multiplier == 8); + assert(scale_x_y >= 1.0); + + YoloLayerPlugin* obj = new YoloLayerPlugin(yolo_width, yolo_height, num_anchors, anchors, num_classes, yolo_width * input_multiplier, yolo_height * input_multiplier, scale_x_y, new_coords); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + + IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT + { + YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + + PluginFieldCollection YoloPluginCreator::mFC{}; + std::vector YoloPluginCreator::mPluginAttributes; +} // namespace nvinfer1 diff --git a/plugins/yolo_layer.h b/plugins/yolo_layer.h new file mode 100644 index 0000000..4264cb2 --- /dev/null +++ b/plugins/yolo_layer.h @@ -0,0 +1,150 @@ +#ifndef _YOLO_LAYER_H +#define _YOLO_LAYER_H + +#include +#include +#include +#include +#include "math_constants.h" +#include "NvInfer.h" + +#define MAX_ANCHORS 6 + +#if NV_TENSORRT_MAJOR >= 8 +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + +#define CHECK(status) \ + do { \ + auto ret = status; \ + if (ret != 0) { \ + std::cerr << "Cuda failure in file '" << __FILE__ \ + << "' line " << __LINE__ \ + << ": " << ret << std::endl; \ + abort(); \ + } \ + } while (0) + +namespace Yolo +{ + static constexpr float IGNORE_THRESH = 0.01f; + + struct alignas(float) Detection { + float bbox[4]; // x, y, w, h + float det_confidence; + float class_id; + float class_confidence; + }; +} + +namespace nvinfer1 +{ + class YoloLayerPlugin: public IPluginV2IOExt + { + public: + YoloLayerPlugin(int yolo_width, int yolo_height, int num_anchors, float* anchors, int num_classes, int input_width, int input_height, float scale_x_y, int new_coords); + YoloLayerPlugin(const void* data, size_t length); + + ~YoloLayerPlugin() override = default; + + IPluginV2IOExt* clone() const NOEXCEPT override; + + int initialize() NOEXCEPT override { return 0; } + + void terminate() NOEXCEPT override; + + void destroy() NOEXCEPT override { delete this; } + + size_t getSerializationSize() const NOEXCEPT override; + + void serialize(void* buffer) const NOEXCEPT override; + + int getNbOutputs() const NOEXCEPT override { return 1; } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT override; + + size_t getWorkspaceSize(int maxBatchSize) const NOEXCEPT override { return 0; } + + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && 
inOut[pos].type == DataType::kFLOAT; } + + const char* getPluginType() const NOEXCEPT override { return "YoloLayer_TRT"; } + + const char* getPluginVersion() const NOEXCEPT override { return "1"; } + + void setPluginNamespace(const char* pluginNamespace) NOEXCEPT override { mPluginNamespace = pluginNamespace; } + + const char* getPluginNamespace() const NOEXCEPT override { return mPluginNamespace; } + + DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const NOEXCEPT override { return DataType::kFLOAT; } + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const NOEXCEPT override { return false; } + + bool canBroadcastInputAcrossBatch(int inputIndex) const NOEXCEPT override { return false; } + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) NOEXCEPT override {} + + //using IPluginV2IOExt::configurePlugin; + void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) NOEXCEPT override {} + + void detachFromContext() NOEXCEPT override {} + +#if NV_TENSORRT_MAJOR >= 8 + int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT override; +#else + int enqueue(int batchSize, const void* const * inputs, void** outputs, void* workspace, cudaStream_t stream) NOEXCEPT override; +#endif + + private: + void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize = 1); + + int mThreadCount = 64; + int mYoloWidth, mYoloHeight, mNumAnchors; + float mAnchorsHost[MAX_ANCHORS * 2]; + float *mAnchors; // allocated on GPU + int mNumClasses; + int mInputWidth, mInputHeight; + float mScaleXY; + int mNewCoords = 0; + + const char* mPluginNamespace; + }; + + class YoloPluginCreator : public IPluginCreator + { + public: + YoloPluginCreator(); + + ~YoloPluginCreator() override = default; + + const char* getPluginName() const NOEXCEPT override; + + const char* getPluginVersion() const NOEXCEPT override; + + const PluginFieldCollection* getFieldNames() NOEXCEPT override; + + IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT override; + + IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT override; + + void setPluginNamespace(const char* libNamespace) NOEXCEPT override + { + mNamespace = libNamespace; + } + + const char* getPluginNamespace() const NOEXCEPT override + { + return mNamespace.c_str(); + } + + private: + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; + std::string mNamespace; + }; + + REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); +}; + +#endif diff --git a/pytrt.pxd b/pytrt.pxd new file mode 100644 index 0000000..6283766 --- /dev/null +++ b/pytrt.pxd @@ -0,0 +1,22 @@ +from libcpp.string cimport string + +cdef extern from 'trtNet.cpp' namespace 'trtnet': + pass + +cdef extern from 'trtNet.h' namespace 'trtnet': + cdef cppclass TrtGooglenet: + TrtGooglenet() except + + void initEngine(string, int *, int *) + void forward(float *, float *) + void destroy() + + cdef cppclass TrtMtcnnDet: + TrtMtcnnDet() except + + void initDet1(string, int *, int *, int *) + void initDet2(string, int *, int *, int *) + void initDet3(string, int *, int *, int *, int *) + void setBatchSize(int) + int getBatchSize() + void forward(float *, float *, float *) + void forward(float *, float *, float *, float *) + void destroy() 
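These `.pxd` declarations expose the C++ `TrtGooglenet` and `TrtMtcnnDet` classes to Cython; the `PyTrtGooglenet` and `PyTrtMtcnn` wrappers defined in `pytrt.pyx` below are what the demo scripts import once `make` has built the extension. A minimal usage sketch of the GoogLeNet wrapper follows — the engine path and binding shapes are placeholders for illustration, not values taken from this repository:

```python
# Hypothetical usage of the compiled 'pytrt' extension (built via the top-level
# Makefile / setup.py). ENGINE_PATH, DATA_SHAPE and PROB_SHAPE are assumptions;
# use whatever shapes your serialized engine was actually built with.
import numpy as np
import pytrt

ENGINE_PATH = 'googlenet/deploy.engine'   # placeholder path
DATA_SHAPE = (3, 224, 224)                # (C, H, W) of the input binding
PROB_SHAPE = (1000, 1, 1)                 # (C, H, W) of the output binding

net = pytrt.PyTrtGooglenet(ENGINE_PATH, DATA_SHAPE, PROB_SHAPE)
imgs = np.zeros((1,) + DATA_SHAPE, dtype=np.float32)  # forward() expects batch size 1
outputs = net.forward(imgs)                           # returns {'prob': ndarray}
print(outputs['prob'].shape)                          # -> (1, 1000, 1, 1)
net.destroy()
```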
diff --git a/pytrt.pyx b/pytrt.pyx new file mode 100644 index 0000000..1d9ff89 --- /dev/null +++ b/pytrt.pyx @@ -0,0 +1,134 @@ +import cython + +import numpy as np +cimport numpy as np +from libcpp.string cimport string + +from pytrt cimport TrtGooglenet +from pytrt cimport TrtMtcnnDet + +cdef class PyTrtGooglenet: + cdef TrtGooglenet *c_trtnet + cdef tuple data_dims, prob_dims + + def __cinit__(PyTrtGooglenet self): + self.c_trtnet = NULL + + def __init__(PyTrtGooglenet self, + str engine_path, tuple shape0, tuple shape1): + assert len(shape0) == 3 and len(shape1) == 3 + self.c_trtnet = new TrtGooglenet() + self.data_dims = shape0 + self.prob_dims = shape1 + cdef int[:] v0 = np.array(shape0, dtype=np.intc) + cdef int[:] v1 = np.array(shape1, dtype=np.intc) + cdef string c_str = engine_path.encode('UTF-8') + self.c_trtnet.initEngine(c_str, &v0[0], &v1[0]) + + def forward(PyTrtGooglenet self, + np.ndarray[np.float32_t, ndim=4] np_imgs not None): + """Do a forward() computation on the input batch of imgs.""" + assert np_imgs.shape[0] == 1 # only accept batch_size = 1 + if not np_imgs.flags['C_CONTIGUOUS']: + np_imgs = np.ascontiguousarray(np_imgs) + np_prob = np.ascontiguousarray( + np.zeros((1,) + self.prob_dims, dtype=np.float32) + ) + cdef float[:,:,:,::1] v_imgs = np_imgs + cdef float[:,:,:,::1] v_prob = np_prob + self.c_trtnet.forward(&v_imgs[0][0][0][0], &v_prob[0][0][0][0]) + return { 'prob': np_prob } + + def destroy(PyTrtGooglenet self): + self.c_trtnet.destroy() + + +cdef class PyTrtMtcnn: + cdef TrtMtcnnDet *c_trtnet + cdef int batch_size + cdef int num_bindings + cdef tuple data_dims, prob1_dims, boxes_dims, marks_dims + + def __cinit__(PyTrtMtcnn self): + self.c_trtnet = NULL + + def __init__(PyTrtMtcnn self, + str engine_path, + tuple shape0, tuple shape1, tuple shape2, tuple shape3=None): + self.num_bindings = 4 if shape3 else 3 + assert len(shape0) == 3 and len(shape1) == 3 and len(shape2) == 3 + if shape3: assert len(shape3) == 3 + else: shape3 = (0, 0, 0) # set to a dummy shape + self.c_trtnet = new TrtMtcnnDet() + self.batch_size = 0 + self.data_dims = shape0 + self.prob1_dims = shape1 + self.boxes_dims = shape2 + self.marks_dims = shape3 + cdef int[:] v0 = np.array(shape0, dtype=np.intc) + cdef int[:] v1 = np.array(shape1, dtype=np.intc) + cdef int[:] v2 = np.array(shape2, dtype=np.intc) + cdef int[:] v3 = np.array(shape3, dtype=np.intc) + cdef string c_str = engine_path.encode('UTF-8') + if 'det1' in engine_path: + self.c_trtnet.initDet1(c_str, &v0[0], &v1[0], &v2[0]) + elif 'det2' in engine_path: + self.c_trtnet.initDet2(c_str, &v0[0], &v1[0], &v2[0]) + elif 'det3' in engine_path: + self.c_trtnet.initDet3(c_str, &v0[0], &v1[0], &v2[0], &v3[0]) + else: + raise ValueError('engine is neither of det1, det2 or det3!') + + def set_batchsize(PyTrtMtcnn self, int batch_size): + self.c_trtnet.setBatchSize(batch_size) + self.batch_size = batch_size + + def _forward_3(PyTrtMtcnn self, + np.ndarray[np.float32_t, ndim=4] np_imgs not None, + np.ndarray[np.float32_t, ndim=4] np_prob1 not None, + np.ndarray[np.float32_t, ndim=4] np_boxes not None): + cdef float[:,:,:,::1] v_imgs = np_imgs + cdef float[:,:,:,::1] v_probs = np_prob1 + cdef float[:,:,:,::1] v_boxes = np_boxes + self.c_trtnet.forward(&v_imgs[0][0][0][0], + &v_probs[0][0][0][0], + &v_boxes[0][0][0][0]) + return { 'prob1': np_prob1, 'boxes': np_boxes } + + def _forward_4(PyTrtMtcnn self, + np.ndarray[np.float32_t, ndim=4] np_imgs not None, + np.ndarray[np.float32_t, ndim=4] np_prob1 not None, + np.ndarray[np.float32_t, 
ndim=4] np_boxes not None, + np.ndarray[np.float32_t, ndim=4] np_marks not None): + cdef float[:,:,:,::1] v_imgs = np_imgs + cdef float[:,:,:,::1] v_probs = np_prob1 + cdef float[:,:,:,::1] v_boxes = np_boxes + cdef float[:,:,:,::1] v_marks = np_marks + self.c_trtnet.forward(&v_imgs[0][0][0][0], + &v_probs[0][0][0][0], + &v_boxes[0][0][0][0], + &v_marks[0][0][0][0]) + return { 'prob1': np_prob1, 'boxes': np_boxes, 'landmarks': np_marks } + + def forward(PyTrtMtcnn self, + np.ndarray[np.float32_t, ndim=4] np_imgs not None): + """Do a forward() computation on the input batch of imgs.""" + assert(np_imgs.shape[0] == self.batch_size) + if not np_imgs.flags['C_CONTIGUOUS']: + np_imgs = np.ascontiguousarray(np_imgs) + np_prob1 = np.ascontiguousarray( + np.zeros((self.batch_size,) + self.prob1_dims, dtype=np.float32) + ) + np_boxes = np.ascontiguousarray( + np.zeros((self.batch_size,) + self.boxes_dims, dtype=np.float32) + ) + np_marks = np.ascontiguousarray( + np.zeros((self.batch_size,) + self.marks_dims, dtype=np.float32) + ) + if self.num_bindings == 3: + return self._forward_3(np_imgs, np_prob1, np_boxes) + else: # self.num_bindings == 4 + return self._forward_4(np_imgs, np_prob1, np_boxes, np_marks) + + def destroy(PyTrtMtcnn self): + self.c_trtnet.destroy() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b527557 --- /dev/null +++ b/setup.py @@ -0,0 +1,47 @@ +from distutils.core import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext +from Cython.Build import cythonize + +import numpy + +library_dirs = [ + '/usr/local/cuda/lib64', + '/usr/local/TensorRT-7.1.3.4/lib', # for my x86_64 PC + '/usr/local/lib', +] + +libraries = [ + 'nvinfer', + 'cudnn', + 'cublas', + 'cudart_static', + 'nvToolsExt', + 'cudart', + 'rt', +] + +include_dirs = [ + # in case the following numpy include path does not work, you + # could replace it manually with, say, + # '-I/usr/local/lib/python3.6/dist-packages/numpy/core/include', + '-I' + numpy.__path__[0] + '/core/include', + '-I/usr/local/cuda/include', + '-I/usr/local/TensorRT-7.1.3.4/include', # for my x86_64 PC + '-I/usr/local/include', +] + +setup( + cmdclass={'build_ext': build_ext}, + ext_modules=cythonize( + Extension( + 'pytrt', + sources=['pytrt.pyx'], + language='c++', + library_dirs=library_dirs, + libraries=libraries, + extra_compile_args=['-O3', '-std=c++11'] + include_dirs + ), + compiler_directives={'language_level': '3'} + ) +) diff --git a/ssd/README.md b/ssd/README.md new file mode 100644 index 0000000..04b6cc9 --- /dev/null +++ b/ssd/README.md @@ -0,0 +1,12 @@ +Reference: + +1. [AastaNV/TRT_object_detection](https://github.com/AastaNV/TRT_object_detection) +2. ['sampleUffSSD' in TensorRT samples](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sample-support-guide/index.html#uffssd_sample) + +Sources of the trained models: + +* 'ssd_mobilenet_v1_coco.pb' and 'ssd_mobilnet_v2_coco.pb': This is just the 'frozen_inference_graph.pb' file in [ssd_mobilenet_v1_coco_2018_01_28.tar.gz](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz) and [ssd_mobilenet_v2_coco_2018_03_29.tar.gz](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz), i.e. 2 of the trained models in [TensorFlow 1 Detection Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md). 
+ +* 'ssd_mobilenet_v1_egohands.pb' and 'ssd_mobilenet_v2_egohands.pb': These models are trained using my [Hand Detection Tutorial](https://github.com/jkjung-avt/hand-detection-tutorial) code. After training, just run the [export.sh](https://github.com/jkjung-avt/hand-detection-tutorial/blob/master/export.sh) script to generated the frozen graph (pb) files. + +* I've also added support for [ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2018_01_28.tar.gz) in the code. You could download the .pb by following the link. diff --git a/ssd/build_engine.py b/ssd/build_engine.py new file mode 100644 index 0000000..65729a9 --- /dev/null +++ b/ssd/build_engine.py @@ -0,0 +1,304 @@ +"""build_engine.py + +This script converts a SSD model (pb) to UFF and subsequently builds +the TensorRT engine. + +Input : ssd_mobilenet_v[1|2]_[coco|egohands].pb +Output: TRT_ssd_mobilenet_v[1|2]_[coco|egohands].bin +""" + + +import os +import ctypes +import argparse + +import numpy as np +import uff +import tensorrt as trt +import graphsurgeon as gs + + +DIR_NAME = os.path.dirname(__file__) +LIB_FILE = os.path.abspath(os.path.join(DIR_NAME, 'libflattenconcat.so')) +MODEL_SPECS = { + 'ssd_mobilenet_v1_coco': { + 'input_pb': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v1_coco.pb')), + 'tmp_uff': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v1_coco.uff')), + 'output_bin': os.path.abspath(os.path.join( + DIR_NAME, 'TRT_ssd_mobilenet_v1_coco.bin')), + 'num_classes': 91, + 'min_size': 0.2, + 'max_size': 0.95, + 'input_order': [0, 2, 1], # order of loc_data, conf_data, priorbox_data + }, + 'ssd_mobilenet_v1_egohands': { + 'input_pb': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v1_egohands.pb')), + 'tmp_uff': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v1_egohands.uff')), + 'output_bin': os.path.abspath(os.path.join( + DIR_NAME, 'TRT_ssd_mobilenet_v1_egohands.bin')), + 'num_classes': 2, + 'min_size': 0.05, + 'max_size': 0.95, + 'input_order': [0, 2, 1], # order of loc_data, conf_data, priorbox_data + }, + 'ssd_mobilenet_v2_coco': { + 'input_pb': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v2_coco.pb')), + 'tmp_uff': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v2_coco.uff')), + 'output_bin': os.path.abspath(os.path.join( + DIR_NAME, 'TRT_ssd_mobilenet_v2_coco.bin')), + 'num_classes': 91, + 'min_size': 0.2, + 'max_size': 0.95, + 'input_order': [1, 0, 2], # order of loc_data, conf_data, priorbox_data + }, + 'ssd_mobilenet_v2_egohands': { + 'input_pb': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v2_egohands.pb')), + 'tmp_uff': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_mobilenet_v2_egohands.uff')), + 'output_bin': os.path.abspath(os.path.join( + DIR_NAME, 'TRT_ssd_mobilenet_v2_egohands.bin')), + 'num_classes': 2, + 'min_size': 0.05, + 'max_size': 0.95, + 'input_order': [0, 2, 1], # order of loc_data, conf_data, priorbox_data + }, + 'ssd_inception_v2_coco': { + 'input_pb': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_inception_v2_coco.pb')), + 'tmp_uff': os.path.abspath(os.path.join( + DIR_NAME, 'ssd_inception_v2_coco.uff')), + 'output_bin': os.path.abspath(os.path.join( + DIR_NAME, 'TRT_ssd_inception_v2_coco.bin')), + 'num_classes': 91, + 'min_size': 0.2, + 'max_size': 0.95, + 'input_order': [0, 2, 1], # order of loc_data, conf_data, priorbox_data + }, + 'ssdlite_mobilenet_v2_coco': { + 'input_pb': os.path.abspath(os.path.join( + DIR_NAME, 'ssdlite_mobilenet_v2_coco.pb')), + 
'tmp_uff': os.path.abspath(os.path.join( + DIR_NAME, 'ssdlite_mobilenet_v2_coco.uff')), + 'output_bin': os.path.abspath(os.path.join( + DIR_NAME, 'TRT_ssdlite_mobilenet_v2_coco.bin')), + 'num_classes': 91, + 'min_size': 0.2, + 'max_size': 0.95, + 'input_order': [0, 2, 1], # order of loc_data, conf_data, priorbox_data + }, +} +INPUT_DIMS = (3, 300, 300) +DEBUG_UFF = False + + +def replace_addv2(graph): + """Replace all 'AddV2' in the graph with 'Add'. + + 'AddV2' is not supported by UFF parser. + + Reference: + 1. https://github.com/jkjung-avt/tensorrt_demos/issues/113#issuecomment-629900809 + """ + for node in graph.find_nodes_by_op('AddV2'): + gs.update_node(node, op='Add') + return graph + + +def replace_fusedbnv3(graph): + """Replace all 'FusedBatchNormV3' in the graph with 'FusedBatchNorm'. + + 'FusedBatchNormV3' is not supported by UFF parser. + + Reference: + 1. https://devtalk.nvidia.com/default/topic/1066445/tensorrt/tensorrt-6-0-1-tensorflow-1-14-no-conversion-function-registered-for-layer-fusedbatchnormv3-yet/post/5403567/#5403567 + 2. https://github.com/jkjung-avt/tensorrt_demos/issues/76#issuecomment-607879831 + """ + for node in graph.find_nodes_by_op('FusedBatchNormV3'): + gs.update_node(node, op='FusedBatchNorm') + return graph + + +def add_anchor_input(graph): + """Add the missing const input for the GridAnchor node. + + Reference: + 1. https://www.minds.ai/post/deploying-ssd-mobilenet-v2-on-the-nvidia-jetson-and-nano-platforms + """ + data = np.array([1, 1], dtype=np.float32) + anchor_input = gs.create_node('AnchorInput', 'Const', value=data) + graph.append(anchor_input) + graph.find_nodes_by_op('GridAnchor_TRT')[0].input.insert(0, 'AnchorInput') + return graph + +def add_plugin(graph, model, spec): + """add_plugin + + Reference: + 1. https://github.com/AastaNV/TRT_object_detection/blob/master/config/model_ssd_mobilenet_v1_coco_2018_01_28.py + 2. https://github.com/AastaNV/TRT_object_detection/blob/master/config/model_ssd_mobilenet_v2_coco_2018_03_29.py + 3. 
https://devtalk.nvidia.com/default/topic/1050465/jetson-nano/how-to-write-config-py-for-converting-ssd-mobilenetv2-to-uff-format/post/5333033/#5333033 + """ + numClasses = spec['num_classes'] + minSize = spec['min_size'] + maxSize = spec['max_size'] + inputOrder = spec['input_order'] + + all_assert_nodes = graph.find_nodes_by_op('Assert') + graph.remove(all_assert_nodes, remove_exclusive_dependencies=True) + + all_identity_nodes = graph.find_nodes_by_op('Identity') + graph.forward_inputs(all_identity_nodes) + + Input = gs.create_plugin_node( + name='Input', + op='Placeholder', + shape=(1,) + INPUT_DIMS + ) + + PriorBox = gs.create_plugin_node( + name='MultipleGridAnchorGenerator', + op='GridAnchor_TRT', + minSize=minSize, # was 0.2 + maxSize=maxSize, # was 0.95 + aspectRatios=[1.0, 2.0, 0.5, 3.0, 0.33], + variance=[0.1, 0.1, 0.2, 0.2], + featureMapShapes=[19, 10, 5, 3, 2, 1], + numLayers=6 + ) + + NMS = gs.create_plugin_node( + name='NMS', + op='NMS_TRT', + shareLocation=1, + varianceEncodedInTarget=0, + backgroundLabelId=0, + confidenceThreshold=0.3, # was 1e-8 + nmsThreshold=0.6, + topK=100, + keepTopK=100, + numClasses=numClasses, # was 91 + inputOrder=inputOrder, + confSigmoid=1, + isNormalized=1 + ) + + concat_priorbox = gs.create_node( + 'concat_priorbox', + op='ConcatV2', + axis=2 + ) + + if trt.__version__[0] >= '7': + concat_box_loc = gs.create_plugin_node( + 'concat_box_loc', + op='FlattenConcat_TRT', + axis=1, + ignoreBatch=0 + ) + concat_box_conf = gs.create_plugin_node( + 'concat_box_conf', + op='FlattenConcat_TRT', + axis=1, + ignoreBatch=0 + ) + else: + concat_box_loc = gs.create_plugin_node( + 'concat_box_loc', + op='FlattenConcat_TRT' + ) + concat_box_conf = gs.create_plugin_node( + 'concat_box_conf', + op='FlattenConcat_TRT' + ) + + namespace_for_removal = [ + 'ToFloat', + 'image_tensor', + 'Preprocessor/map/TensorArrayStack_1/TensorArrayGatherV3', + ] + namespace_plugin_map = { + 'MultipleGridAnchorGenerator': PriorBox, + 'Postprocessor': NMS, + 'Preprocessor': Input, + 'ToFloat': Input, + 'Cast': Input, # added for models trained with tf 1.15+ + 'image_tensor': Input, + 'MultipleGridAnchorGenerator/Concatenate': concat_priorbox, # for 'ssd_mobilenet_v1_coco' + 'Concatenate': concat_priorbox, # for other models + 'concat': concat_box_loc, + 'concat_1': concat_box_conf + } + + graph.remove(graph.find_nodes_by_path(['Preprocessor/map/TensorArrayStack_1/TensorArrayGatherV3']), remove_exclusive_dependencies=False) # for 'ssd_inception_v2_coco' + + graph.collapse_namespaces(namespace_plugin_map) + graph = replace_addv2(graph) + graph = replace_fusedbnv3(graph) + + if 'image_tensor:0' in graph.find_nodes_by_name('Input')[0].input: + graph.find_nodes_by_name('Input')[0].input.remove('image_tensor:0') + if 'Input' in graph.find_nodes_by_name('NMS')[0].input: + graph.find_nodes_by_name('NMS')[0].input.remove('Input') + # Remove the Squeeze to avoid "Assertion 'isPlugin(layerName)' failed" + graph.forward_inputs(graph.find_node_inputs_by_name(graph.graph_outputs[0], 'Squeeze')) + if 'anchors' in [node.name for node in graph.graph_outputs]: + graph.remove('anchors', remove_exclusive_dependencies=False) + if len(graph.find_nodes_by_op('GridAnchor_TRT')[0].input) < 1: + graph = add_anchor_input(graph) + if 'NMS' not in [node.name for node in graph.graph_outputs]: + graph.remove(graph.graph_outputs, remove_exclusive_dependencies=False) + if 'NMS' not in [node.name for node in graph.graph_outputs]: + # We expect 'NMS' to be one of the outputs + raise RuntimeError('bad graph_outputs') + 
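+    # At this point the graph surgery is done: the TensorFlow preprocessing
+    # subgraph has been collapsed into the 'Input' placeholder, the box and
+    # score tensors feed the 'GridAnchor_TRT', 'FlattenConcat_TRT' and
+    # 'NMS_TRT' plugin nodes, and 'NMS' is left as the single graph output,
+    # which is what the UFF conversion in main() below (output_nodes=['NMS'])
+    # expects.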
+ return graph + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model', type=str, choices=list(MODEL_SPECS.keys())) + args = parser.parse_args() + + # initialize + if trt.__version__[0] < '7': + ctypes.CDLL(LIB_FILE) + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + trt.init_libnvinfer_plugins(TRT_LOGGER, '') + + # compile the model into TensorRT engine + model = args.model + spec = MODEL_SPECS[model] + dynamic_graph = add_plugin( + gs.DynamicGraph(spec['input_pb']), + model, + spec) + _ = uff.from_tensorflow( + dynamic_graph.as_graph_def(), + output_nodes=['NMS'], + output_filename=spec['tmp_uff'], + text=True, + debug_mode=DEBUG_UFF) + with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser: + builder.max_workspace_size = 1 << 28 + builder.max_batch_size = 1 + builder.fp16_mode = True + + parser.register_input('Input', INPUT_DIMS) + parser.register_output('MarkOutput_0') + parser.parse(spec['tmp_uff'], network) + engine = builder.build_cuda_engine(network) + + buf = engine.serialize() + with open(spec['output_bin'], 'wb') as f: + f.write(buf) + + +if __name__ == '__main__': + main() diff --git a/ssd/build_engines.sh b/ssd/build_engines.sh new file mode 100755 index 0000000..eb0243d --- /dev/null +++ b/ssd/build_engines.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -xe + +for model in ssd_mobilenet_v1_coco \ + ssd_mobilenet_v1_egohands \ + ssd_mobilenet_v2_coco \ + ssd_mobilenet_v2_egohands ; do + python3 build_engine.py ${model} +done diff --git a/ssd/graphsurgeon.patch-4.2 b/ssd/graphsurgeon.patch-4.2 new file mode 100644 index 0000000..7d8475a --- /dev/null +++ b/ssd/graphsurgeon.patch-4.2 @@ -0,0 +1,12 @@ +diff --git a/node_manipulation.py b/node_manipulation.py +index d2d012a..1ef30a0 100644 +--- a/node_manipulation.py ++++ b/node_manipulation.py +@@ -30,6 +30,7 @@ def create_node(name, op=None, _do_suffix=False, **kwargs): + node = NodeDef() + node.name = name + node.op = op if op else name ++ node.attr["dtype"].type = 1 + for key, val in kwargs.items(): + if key == "dtype": + node.attr["dtype"].type = val.as_datatype_enum diff --git a/ssd/graphsurgeon.patch-4.2.2 b/ssd/graphsurgeon.patch-4.2.2 new file mode 100644 index 0000000..4707c66 --- /dev/null +++ b/ssd/graphsurgeon.patch-4.2.2 @@ -0,0 +1,11 @@ +diff -Naur a/node_manipulation.py b/node_manipulation.py +--- a/node_manipulation.py 2019-10-24 13:17:10.203943256 +0800 ++++ b/node_manipulation.py 2019-10-24 13:19:08.851943211 +0800 +@@ -39,6 +39,7 @@ + ''' + node.name = name or node.name + node.op = op or node.op or node.name ++ node.attr["dtype"].type = 1 + for key, val in kwargs.items(): + if isinstance(val, tf.DType): + node.attr[key].type = val.as_datatype_enum diff --git a/ssd/graphsurgeon.patch-4.4 b/ssd/graphsurgeon.patch-4.4 new file mode 100644 index 0000000..0605e86 --- /dev/null +++ b/ssd/graphsurgeon.patch-4.4 @@ -0,0 +1,10 @@ +--- a/node_manipulation.py 2020-07-14 08:34:41.959988887 +0800 ++++ b/node_manipulation.py 2020-07-14 08:36:11.863988853 +0800 +@@ -86,6 +86,7 @@ + ''' + node.name = name or node.name + node.op = op or node.op or node.name ++ node.attr["dtype"].type = 1 + for key, val in kwargs.items(): + if isinstance(val, tf.DType): + node.attr[key].type = val.as_datatype_enum diff --git a/ssd/install.sh b/ssd/install.sh new file mode 100755 index 0000000..290afde --- /dev/null +++ b/ssd/install.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -e + +# install pycuda if necessary +if ! 
python3 -c "import pycuda" > /dev/null 2>&1; then + ./install_pycuda.sh +fi + +echo "** Patch 'graphsurgeon.py' in TensorRT" + +script_path=$(realpath $0) +gs_path=$(ls /usr/lib/python3.?/dist-packages/graphsurgeon/node_manipulation.py) +patch_path=$(dirname $script_path)/graphsurgeon.patch + +if head -30 ${gs_path} | tail -1 | grep -q NodeDef; then + # This is for JetPack-4.2 + sudo patch -N -p1 -r - ${gs_path} ${patch_path}-4.2 && echo +fi +if head -22 ${gs_path} | tail -1 | grep -q update_node; then + # This is for JetPack-4.2.2 + sudo patch -N -p1 -r - ${gs_path} ${patch_path}-4.2.2 && echo +fi +if head -69 ${gs_path} | tail -1 | grep -q update_node; then + # This is for JetPack-4.4 + sudo patch -N -p1 -r - ${gs_path} ${patch_path}-4.4 && echo +fi + +echo "** Making symbolic link of libflattenconcat.so" + +trt_version=$(echo /usr/lib/aarch64-linux-gnu/libnvinfer.so.? | cut -d '.' -f 3) +if [ "${trt_version}" = "5" ] || [ "${trt_version}" = "6" ]; then + ln -sf libflattenconcat.so.${trt_version} libflattenconcat.so +fi + +echo "** Installation done" diff --git a/ssd/install_pycuda.sh b/ssd/install_pycuda.sh new file mode 100755 index 0000000..578ad60 --- /dev/null +++ b/ssd/install_pycuda.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Reference for installing 'pycuda': https://wiki.tiker.net/PyCuda/Installation/Linux/Ubuntu + +set -e + +if ! which nvcc > /dev/null; then + echo "ERROR: nvcc not found" + exit +fi + +arch=$(uname -m) +folder=${HOME}/src +mkdir -p $folder + +echo "** Install requirements" +sudo apt-get install -y build-essential python3-dev +sudo apt-get install -y libboost-python-dev libboost-thread-dev +sudo pip3 install setuptools + +boost_pylib=$(basename /usr/lib/${arch}-linux-gnu/libboost_python*-py3?.so) +boost_pylibname=${boost_pylib%.so} +boost_pyname=${boost_pylibname/lib/} + +echo "** Download pycuda-2019.1.2 sources" +pushd $folder +if [ ! 
-f pycuda-2019.1.2.tar.gz ]; then + wget https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz +fi + +echo "** Build and install pycuda-2019.1.2" +CPU_CORES=$(nproc) +echo "** cpu cores available: " $CPU_CORES +tar xzvf pycuda-2019.1.2.tar.gz +cd pycuda-2019.1.2 +python3 ./configure.py --python-exe=/usr/bin/python3 --cuda-root=/usr/local/cuda --cudadrv-lib-dir=/usr/lib/${arch}-linux-gnu --boost-inc-dir=/usr/include --boost-lib-dir=/usr/lib/${arch}-linux-gnu --boost-python-libname=${boost_pyname} --boost-thread-libname=boost_thread --no-use-shipped-boost +make -j$CPU_CORES +python3 setup.py build +sudo python3 setup.py install + +popd + +python3 -c "import pycuda; print('pycuda version:', pycuda.VERSION)" diff --git a/ssd/libflattenconcat.so.5 b/ssd/libflattenconcat.so.5 new file mode 100755 index 0000000..1bbb2f5 Binary files /dev/null and b/ssd/libflattenconcat.so.5 differ diff --git a/ssd/libflattenconcat.so.6 b/ssd/libflattenconcat.so.6 new file mode 100755 index 0000000..189ec1b Binary files /dev/null and b/ssd/libflattenconcat.so.6 differ diff --git a/ssd/ssd_mobilenet_v1_coco.pb b/ssd/ssd_mobilenet_v1_coco.pb new file mode 100644 index 0000000..d11874e Binary files /dev/null and b/ssd/ssd_mobilenet_v1_coco.pb differ diff --git a/ssd/ssd_mobilenet_v1_egohands.pb b/ssd/ssd_mobilenet_v1_egohands.pb new file mode 100644 index 0000000..0a1165a Binary files /dev/null and b/ssd/ssd_mobilenet_v1_egohands.pb differ diff --git a/ssd/ssd_mobilenet_v2_coco.pb b/ssd/ssd_mobilenet_v2_coco.pb new file mode 100644 index 0000000..a0def1a Binary files /dev/null and b/ssd/ssd_mobilenet_v2_coco.pb differ diff --git a/ssd/ssd_mobilenet_v2_egohands.pb b/ssd/ssd_mobilenet_v2_egohands.pb new file mode 100644 index 0000000..1c6edc1 Binary files /dev/null and b/ssd/ssd_mobilenet_v2_egohands.pb differ diff --git a/test_modnet.py b/test_modnet.py new file mode 100644 index 0000000..bb41be5 --- /dev/null +++ b/test_modnet.py @@ -0,0 +1,12 @@ +import numpy as np +import cv2 + +import pycuda.autoinit +from utils.modnet import TrtMODNet + +img = cv2.imread('modnet/image.jpg') +modnet = TrtMODNet() +matte = modnet.infer(img) +cv2.imshow('Matte', matte) +cv2.waitKey(0) +cv2.destroyAllWindows() diff --git a/trtNet.cpp b/trtNet.cpp new file mode 100644 index 0000000..e017d90 --- /dev/null +++ b/trtNet.cpp @@ -0,0 +1,303 @@ +// trtNet.cpp + +#include "trtNet.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#define CHECK(status) \ + do { \ + auto ret = status; \ + if (ret != 0) { \ + std::cerr << "Cuda failure in file '" << __FILE__ \ + << "' line " << __LINE__ \ + << ": " << ret << std::endl; \ + abort(); \ + } \ + } while (0) + +#define my_assert(EXP, MSG) \ + do { \ + if (!(EXP)) { \ + std::cerr << "Assertion fail in file '" << __FILE__ \ + << "' line " << __LINE__ \ + << ": " << (MSG) << std:: endl; \ + throw std::exception(); \ + } \ + } while (0) + + +namespace trtnet { + + // + // TrtGooglenet stuffs + // + + TrtGooglenet::TrtGooglenet() + { + for (int i = 0; i < 2; i++) { + _gpu_buffers[i] = nullptr; + } + } + + void TrtGooglenet::_initEngine(std::string filePath) + { + _gieModelStream = new IHostMemoryFromFile(filePath); + _runtime = createInferRuntime(_gLogger); + my_assert(_runtime != nullptr, "_runtime is null"); + _engine = _runtime->deserializeCudaEngine( + _gieModelStream->data(), + _gieModelStream->size(), + nullptr); + my_assert(_engine != nullptr, "_engine is null"); + my_assert(_engine->getNbBindings() == 2, 
"wrong number of bindings"); + _binding_data = _engine->getBindingIndex("data"); + my_assert(_engine->bindingIsInput(_binding_data) == true, "bad type of binding 'data'"); + _binding_prob = _engine->getBindingIndex("prob"); + my_assert(_engine->bindingIsInput(_binding_prob) == false, "bad type of binding 'prob'"); + _context = _engine->createExecutionContext(); + my_assert(_context != nullptr, "_context is null"); + _gieModelStream->destroy(); + CHECK(cudaStreamCreate(&_stream)); + } + + void TrtGooglenet::initEngine(std::string filePath, int dataDims[3], int probDims[3]) + { + _initEngine(filePath); +#if NV_TENSORRT_MAJOR >= 4 + Dims3 d; + d = static_cast(_engine->getBindingDimensions(_binding_data)); + my_assert(d.nbDims == 3, "bad nbDims for 'data'"); + my_assert(d.d[0] == dataDims[0] && d.d[1] == dataDims[1] && d.d[2] == dataDims[2], "bad dims for 'data'"); + _blob_sizes[_binding_data] = d.d[0] * d.d[1] * d.d[2]; + + d = static_cast(_engine->getBindingDimensions(_binding_prob)); + my_assert(d.nbDims == 3, "bad nbDims for 'prob'"); + my_assert(d.d[0] == probDims[0] && d.d[1] == probDims[1] && d.d[2] == probDims[2], "bad dims for 'prob'"); + _blob_sizes[_binding_prob] = d.d[0] * d.d[1] * d.d[2]; +#else // NV_TENSORRT_MAJOR < 4 + DimsCHW d; + d = static_cast(_engine->getBindingDimensions(_binding_data)); + my_assert(d.nbDims == 3, "bad nbDims for 'data'"); + my_assert(d.c() == dataDims[0] && d.h() == dataDims[1] && d.w() == dataDims[2], "bad dims for 'data'"); + _blob_sizes[_binding_data] = d.c() * d.h() * d.w(); + + d = static_cast(_engine->getBindingDimensions(_binding_prob)); + my_assert(d.nbDims == 3, "bad nbDims for 'prob'"); + my_assert(d.c() == probDims[0] && d.h() == probDims[1] && d.w() == probDims[2], "bad dims for 'prob'"); + _blob_sizes[_binding_prob] = d.c() * d.h() * d.w(); +#endif // NV_TENSORRT_MAJOR + + for (int i = 0; i < 2; i++) { + CHECK(cudaMalloc(&_gpu_buffers[i], _blob_sizes[i] * sizeof(float))); + } + } + + void TrtGooglenet::forward(float *imgs, float *prob) + { + CHECK(cudaMemcpyAsync(_gpu_buffers[_binding_data], + imgs, + _blob_sizes[_binding_data] * sizeof(float), + cudaMemcpyHostToDevice, + _stream)); + _context->enqueue(1, _gpu_buffers, _stream, nullptr); + CHECK(cudaMemcpyAsync(prob, + _gpu_buffers[_binding_prob], + _blob_sizes[_binding_prob] * sizeof(float), + cudaMemcpyDeviceToHost, + _stream)); + cudaStreamSynchronize(_stream); + } + + void TrtGooglenet::destroy() + { + for (int i = 0; i < 2; i++) { + if (_gpu_buffers[i] != nullptr) { + CHECK(cudaFree(_gpu_buffers[i])); + _gpu_buffers[i] = nullptr; + } + } + cudaStreamDestroy(_stream); + _context->destroy(); + _engine->destroy(); + _runtime->destroy(); + } + + // + // TrtMtcnnDet stuffs + // + + TrtMtcnnDet::TrtMtcnnDet() + { + for (int i = 0; i < 4; i++) { + _gpu_buffers[i] = nullptr; + } + } + + void TrtMtcnnDet::_initEngine(std::string filePath, const char *dataName, const char *prob1Name, const char *boxesName, const char *marksName="unspecified") + { + _gieModelStream = new IHostMemoryFromFile(filePath); + _runtime = createInferRuntime(_gLogger); + my_assert(_runtime != nullptr, "_runtime is null"); + _engine = _runtime->deserializeCudaEngine( + _gieModelStream->data(), + _gieModelStream->size(), + nullptr); + my_assert(_engine != nullptr, "_engine is null"); + my_assert(_engine->getNbBindings() == _num_bindings, "wrong number of bindings"); + _binding_data = _engine->getBindingIndex(dataName); + my_assert(_engine->bindingIsInput(_binding_data) == true, "bad type of binding 'data'"); + _binding_prob1 
= _engine->getBindingIndex(prob1Name); + my_assert(_engine->bindingIsInput(_binding_prob1) == false, "bad type of binding 'prob1'"); + _binding_boxes = _engine->getBindingIndex(boxesName); + my_assert(_engine->bindingIsInput(_binding_boxes) == false, "bad type of binding 'boxes'"); + if (_num_bindings == 4) { + _binding_marks = _engine->getBindingIndex(marksName); + my_assert(_engine->bindingIsInput(_binding_marks) == false, "bad type of binding 'marks'"); + } + _context = _engine->createExecutionContext(); + my_assert(_context != nullptr, "_context is null"); + _gieModelStream->destroy(); + CHECK(cudaStreamCreate(&_stream)); + } + + void TrtMtcnnDet::_setBlobSizes(int dataDims[3], int prob1Dims[3], int boxesDims[3]) + { +#if NV_TENSORRT_MAJOR >= 4 + Dims3 d; + d = static_cast(_engine->getBindingDimensions(_binding_data)); + my_assert(d.nbDims == 3, "bad nbDims for 'data'"); + my_assert(d.d[0] == dataDims[0] && d.d[1] == dataDims[1] && d.d[2] == dataDims[2], "bad dims for 'data'"); + _blob_sizes[_binding_data] = d.d[0] * d.d[1] * d.d[2]; + + d = static_cast(_engine->getBindingDimensions(_binding_prob1)); + my_assert(d.nbDims == 3, "bad nbDims for 'prob1'"); + my_assert(d.d[0] == prob1Dims[0] && d.d[1] == prob1Dims[1] && d.d[2] == prob1Dims[2], "bad dims for 'prob1'"); + _blob_sizes[_binding_prob1] = d.d[0] * d.d[1] * d.d[2]; + + d = static_cast(_engine->getBindingDimensions(_binding_boxes)); + my_assert(d.nbDims == 3, "bad nbDims for 'boxes'"); + my_assert(d.d[0] == boxesDims[0] && d.d[1] == boxesDims[1] && d.d[2] == boxesDims[2], "bad dims for 'boxes'"); + _blob_sizes[_binding_boxes] = d.d[0] * d.d[1] * d.d[2]; +#else // NV_TENSORRT_MAJOR < 4 + DimsCHW d; + d = static_cast(_engine->getBindingDimensions(_binding_data)); + my_assert(d.nbDims == 3, "bad nbDims for 'data'"); + my_assert(d.c() == dataDims[0] && d.h() == dataDims[1] && d.w() == dataDims[2], "bad dims for 'data'"); + _blob_sizes[_binding_data] = d.c() * d.h() * d.w(); + + d = static_cast(_engine->getBindingDimensions(_binding_prob1)); + my_assert(d.nbDims == 3, "bad nbDims for 'prob1'"); + my_assert(d.c() == prob1Dims[0] && d.h() == prob1Dims[1] && d.w() == prob1Dims[2], "bad dims for 'prob1'"); + _blob_sizes[_binding_prob1] = d.c() * d.h() * d.w(); + + d = static_cast(_engine->getBindingDimensions(_binding_boxes)); + my_assert(d.nbDims == 3, "bad nbDims for 'boxes'"); + my_assert(d.c() == boxesDims[0] && d.h() == boxesDims[1] && d.w() == boxesDims[2], "bad dims for 'boxes'"); + _blob_sizes[_binding_boxes] = d.c() * d.h() * d.w(); +#endif // NV_TENSORRT_MAJOR + } + + void TrtMtcnnDet::initDet1(std::string filePath, int dataDims[3], int prob1Dims[3], int boxesDims[3]) + { + _num_bindings = 3; + _initEngine(filePath, "data", "prob1", "conv4-2"); + _setBlobSizes(dataDims, prob1Dims, boxesDims); + } + + void TrtMtcnnDet::initDet2(std::string filePath, int dataDims[3], int prob1Dims[3], int boxesDims[3]) + { + _num_bindings = 3; + _initEngine(filePath, "data", "prob1", "conv5-2"); + _setBlobSizes(dataDims, prob1Dims, boxesDims); + } + + void TrtMtcnnDet::initDet3(std::string filePath, int dataDims[3], int prob1Dims[3], int boxesDims[3], int marksDims[3]) + { + _num_bindings = 4; + _initEngine(filePath, "data", "prob1", "conv6-2", "conv6-3"); + _setBlobSizes(dataDims, prob1Dims, boxesDims); + +#if NV_TENSORRT_MAJOR >= 4 + Dims3 d; + d = static_cast(_engine->getBindingDimensions(_binding_marks)); + my_assert(d.nbDims == 3, "bad nbDims for 'marks'"); + my_assert(d.d[0] == marksDims[0] && d.d[1] == marksDims[1] && d.d[2] == marksDims[2], 
"bad dims for 'marks'"); + _blob_sizes[_binding_marks] = d.d[0] * d.d[1] * d.d[2]; +#else // NV_TENSORRT_MAJOR < 4 + DimsCHW d; + d = static_cast(_engine->getBindingDimensions(_binding_marks)); + my_assert(d.nbDims == 3, "bad nbDims for 'marks'"); + my_assert(d.c() == marksDims[0] && d.h() == marksDims[1] && d.w() == marksDims[2], "bad dims for 'marks'"); + _blob_sizes[_binding_marks] = d.c() * d.h() * d.w(); +#endif // NV_TENSORRT_MAJOR + } + + void TrtMtcnnDet::setBatchSize(int value) + { + my_assert(value > 0 && value <= 1024, "bad batch_size"); + if (value == _batchsize || _engine == nullptr) + return; // do nothing + _batchsize = value; + for (int i = 0; i < _num_bindings; i++) { + if (_gpu_buffers[i] != nullptr) { + CHECK(cudaFree(_gpu_buffers[i])); + _gpu_buffers[i] = nullptr; + } + } + for (int i = 0; i < _num_bindings; i++) { + CHECK(cudaMalloc(&_gpu_buffers[i], + _batchsize * _blob_sizes[i] * sizeof(float))); + } + } + + int TrtMtcnnDet::getBatchSize() + { + return _batchsize; + } + + void TrtMtcnnDet::forward(float *imgs, float *probs, float *boxes, float *marks=nullptr) + { + my_assert(_batchsize > 0, "_batchsize is not set"); + CHECK(cudaMemcpyAsync(_gpu_buffers[_binding_data], + imgs, + _batchsize * _blob_sizes[_binding_data] * sizeof(float), + cudaMemcpyHostToDevice, + _stream)); + _context->enqueue(_batchsize, _gpu_buffers, _stream, nullptr); + CHECK(cudaMemcpyAsync(probs, + _gpu_buffers[_binding_prob1], + _batchsize * _blob_sizes[_binding_prob1] * sizeof(float), + cudaMemcpyDeviceToHost, + _stream)); + CHECK(cudaMemcpyAsync(boxes, + _gpu_buffers[_binding_boxes], + _batchsize * _blob_sizes[_binding_boxes] * sizeof(float), + cudaMemcpyDeviceToHost, + _stream)); + if (_num_bindings == 4) { + my_assert(marks != nullptr, "pointer 'marks' is null"); + CHECK(cudaMemcpyAsync(marks, + _gpu_buffers[_binding_marks], + _batchsize * _blob_sizes[_binding_marks] * sizeof(float), + cudaMemcpyDeviceToHost, + _stream)); + } + cudaStreamSynchronize(_stream); + } + + void TrtMtcnnDet::destroy() + { + for (int i = 0; i < _num_bindings; i++) { + if (_gpu_buffers[i] != nullptr) { + CHECK(cudaFree(_gpu_buffers[i])); + _gpu_buffers[i] = nullptr; + } + } + cudaStreamDestroy(_stream); + _context->destroy(); + _engine->destroy(); + _runtime->destroy(); + } + +} // namespace trtnet diff --git a/trtNet.h b/trtNet.h new file mode 100644 index 0000000..c61d81c --- /dev/null +++ b/trtNet.h @@ -0,0 +1,121 @@ +// trtNet.h +#ifndef __TRTNET_H__ +#define __TRTNET_H__ + +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvCaffeParser.h" + +using namespace nvinfer1; +using namespace nvcaffeparser1; + +#if NV_TENSORRT_MAJOR >= 8 +#define NOEXCEPT noexcept +#else // NV_TENSORRT_MAJOR < 8 +#define NOEXCEPT +#endif // NV_TENSORRT_MAJOR + +namespace trtnet { + + class Logger : public ILogger + { + void log(Severity severity, const char *msg) NOEXCEPT override + { + if (severity != Severity::kINFO) + std::cout << msg << std::endl; + } + }; + + class IHostMemoryFromFile : public IHostMemory + { + public: + IHostMemoryFromFile(std::string filename) { + std::ifstream infile(filename, std::ifstream::binary | + std::ifstream::ate); + _s = infile.tellg(); + infile.seekg(0, std::ios::beg); + _mem = malloc(_s); + infile.read(reinterpret_cast(_mem), _s); + } +#if NV_TENSORRT_MAJOR >= 6 + void* data() const noexcept { return _mem; } + std::size_t size() const noexcept { return _s; } + DataType type () const noexcept { return DataType::kFLOAT; } // not used + void destroy() noexcept { 
free(_mem); } +#else // NV_TENSORRT_MAJOR < 6 + void* data() const { return _mem; } + std::size_t size() const { return _s; } + DataType type () const { return DataType::kFLOAT; } // not used + void destroy() { free(_mem); } +#endif // NV_TENSORRT_MAJOR + private: + void *_mem{nullptr}; + std::size_t _s; + }; + + class TrtGooglenet + { + public: + TrtGooglenet(); + // init from engine file + void initEngine(std::string filePath, int dataDims[3], int probDims[3]); + void forward(float *imgs, float *prob); + void destroy(); + + private: + Logger _gLogger; + IHostMemoryFromFile *_gieModelStream{nullptr}; + IRuntime *_runtime; + ICudaEngine *_engine; + IExecutionContext *_context; + cudaStream_t _stream; + void *_gpu_buffers[2]; + int _blob_sizes[2]; + int _binding_data; + int _binding_prob; + + void _initEngine(std::string filePath); + }; + + class TrtMtcnnDet + { + public: + TrtMtcnnDet(); + // init from engine file + void initDet1(std::string filePath, int dataDims[3], int prob1Dims[3], int boxesDims[3]); + void initDet2(std::string filePath, int dataDims[3], int prob1Dims[3], int boxesDims[3]); + void initDet3(std::string filePath, int dataDims[3], int prob1Dims[3], int boxesDims[3], int marksDims[3]); + void setBatchSize(int value); + int getBatchSize(); + void forward(float *imgs, float *probs, float *boxes, float *); + void destroy(); + + private: + Logger _gLogger; + IHostMemoryFromFile *_gieModelStream{nullptr}; + IRuntime *_runtime; + ICudaEngine *_engine; + IExecutionContext *_context; + cudaStream_t _stream; + void *_gpu_buffers[4]; + int _blob_sizes[4]; + int _num_bindings = 0; + int _binding_data; + int _binding_prob1; + int _binding_boxes; + int _binding_marks; + int _batchsize = 0; + + void _initEngine(std::string filePath, const char *dataName, const char *prob1Name, const char *boxesName, const char *marksName); + void _setBlobSizes(int dataDims[3], int prob1Dims[3], int boxesDims[3]); + }; + +} // namespace trtnet + +#endif // __TRTNET_H__ diff --git a/trt_googlenet.py b/trt_googlenet.py new file mode 100644 index 0000000..595b30f --- /dev/null +++ b/trt_googlenet.py @@ -0,0 +1,128 @@ +"""trt_googlenet.py + +This script demonstrates how to do real-time image classification +(inferencing) with Cython wrapped TensorRT optimized googlenet engine. 
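+
+The per-frame preprocessing is essentially the following (a condensed
+sketch of what classify() below actually does; see that function for
+the center-cropping details):
+
+    crop = cv2.resize(crop, RESIZED_SHAPE)          # 224x224
+    crop = crop.astype(np.float32) - PIXEL_MEANS    # subtract BGR means
+    crop = crop.transpose((2, 0, 1))                # HWC -> CHW
+    out = net.forward(crop[None])                   # add batch dimension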
+""" + + +import timeit +import argparse + +import numpy as np +import cv2 +from utils.camera import add_camera_args, Camera +from utils.display import open_window, show_help_text, set_display +from pytrt import PyTrtGooglenet + + +PIXEL_MEANS = np.array([[[104., 117., 123.]]], dtype=np.float32) +DEPLOY_ENGINE = 'googlenet/deploy.engine' +ENGINE_SHAPE0 = (3, 224, 224) +ENGINE_SHAPE1 = (1000, 1, 1) +RESIZED_SHAPE = (224, 224) + +WINDOW_NAME = 'TrtGooglenetDemo' + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time image classification with TrtGooglenet ' + 'on Jetson Nano') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument('--crop', dest='crop_center', + help='crop center square of image for ' + 'inferencing [False]', + action='store_true') + args = parser.parse_args() + return args + + +def show_top_preds(img, top_probs, top_labels): + """Show top predicted classes and softmax scores.""" + x = 10 + y = 40 + for prob, label in zip(top_probs, top_labels): + pred = '{:.4f} {:20s}'.format(prob, label) + #cv2.putText(img, pred, (x+1, y), cv2.FONT_HERSHEY_PLAIN, 1.0, + # (32, 32, 32), 4, cv2.LINE_AA) + cv2.putText(img, pred, (x, y), cv2.FONT_HERSHEY_PLAIN, 1.0, + (0, 0, 240), 1, cv2.LINE_AA) + y += 20 + + +def classify(img, net, labels, do_cropping): + """Classify 1 image (crop).""" + crop = img + if do_cropping: + h, w, _ = img.shape + if h < w: + crop = img[:, ((w-h)//2):((w+h)//2), :] + else: + crop = img[((h-w)//2):((h+w)//2), :, :] + + # preprocess the image crop + crop = cv2.resize(crop, RESIZED_SHAPE) + crop = crop.astype(np.float32) - PIXEL_MEANS + crop = crop.transpose((2, 0, 1)) # HWC -> CHW + + # inference the (cropped) image + tic = timeit.default_timer() + out = net.forward(crop[None]) # add 1 dimension to 'crop' as batch + toc = timeit.default_timer() + print('{:.3f}s'.format(toc-tic)) + + # output top 3 predicted scores and class labels + out_prob = np.squeeze(out['prob'][0]) + top_inds = out_prob.argsort()[::-1][:3] + return (out_prob[top_inds], labels[top_inds]) + + +def loop_and_classify(cam, net, labels, do_cropping): + """Continuously capture images from camera and do classification.""" + show_help = True + full_scrn = False + help_text = '"Esc" to Quit, "H" for Help, "F" to Toggle Fullscreen' + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + break + img = cam.read() + if img is None: + break + top_probs, top_labels = classify(img, net, labels, do_cropping) + show_top_preds(img, top_probs, top_labels) + if show_help: + show_help_text(img, help_text) + cv2.imshow(WINDOW_NAME, img) + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('H') or key == ord('h'): # Toggle help message + show_help = not show_help + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_display(WINDOW_NAME, full_scrn) + + +def main(): + args = parse_args() + labels = np.loadtxt('googlenet/synset_words.txt', str, delimiter='\t') + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + # initialize the tensorrt googlenet engine + net = PyTrtGooglenet(DEPLOY_ENGINE, ENGINE_SHAPE0, ENGINE_SHAPE1) + + open_window( + WINDOW_NAME, 'Camera TensorRT GoogLeNet Demo', + cam.img_width, cam.img_height) + loop_and_classify(cam, net, labels, args.crop_center) + + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff 
--git a/trt_googlenet_async.py b/trt_googlenet_async.py new file mode 100644 index 0000000..3991321 --- /dev/null +++ b/trt_googlenet_async.py @@ -0,0 +1,184 @@ +"""trt_googlenet.py + +This is the 'async' version of trt_googlenet.py implementation. + +Refer to trt_ssd_async.py for description about the design and +synchronization between the main and child threads. +""" + + +import sys +import time +import argparse +import threading + +import numpy as np +import cv2 +from utils.camera import add_camera_args, Camera +from utils.display import open_window, set_display, show_fps +from pytrt import PyTrtGooglenet + + +PIXEL_MEANS = np.array([[[104., 117., 123.]]], dtype=np.float32) +DEPLOY_ENGINE = 'googlenet/deploy.engine' +ENGINE_SHAPE0 = (3, 224, 224) +ENGINE_SHAPE1 = (1000, 1, 1) +RESIZED_SHAPE = (224, 224) + +WINDOW_NAME = 'TrtGooglenetDemo' +MAIN_THREAD_TIMEOUT = 10.0 # 10 seconds + +# 'shared' global variables +s_img, s_probs, s_labels = None, None, None + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time image classification with TrtGooglenet ' + 'on Jetson Nano') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument('--crop', dest='crop_center', + help='crop center square of image for ' + 'inferencing [False]', + action='store_true') + args = parser.parse_args() + return args + + +def classify(img, net, labels, do_cropping): + """Classify 1 image (crop).""" + crop = img + if do_cropping: + h, w, _ = img.shape + if h < w: + crop = img[:, ((w-h)//2):((w+h)//2), :] + else: + crop = img[((h-w)//2):((h+w)//2), :, :] + + # preprocess the image crop + crop = cv2.resize(crop, RESIZED_SHAPE) + crop = crop.astype(np.float32) - PIXEL_MEANS + crop = crop.transpose((2, 0, 1)) # HWC -> CHW + + # inference the (cropped) image + out = net.forward(crop[None]) # add 1 dimension to 'crop' as batch + + # output top 3 predicted scores and class labels + out_prob = np.squeeze(out['prob'][0]) + top_inds = out_prob.argsort()[::-1][:3] + return (out_prob[top_inds], labels[top_inds]) + + +class TrtGooglenetThread(threading.Thread): + def __init__(self, condition, cam, labels, do_cropping): + """__init__ + + # Arguments + condition: the condition variable used to notify main + thread about new frame and detection result + cam: the camera object for reading input image frames + labels: a numpy array of class labels + do_cropping: whether to do center-cropping of input image + """ + threading.Thread.__init__(self) + self.condition = condition + self.cam = cam + self.labels = labels + self.do_cropping = do_cropping + self.running = False + + def run(self): + """Run until 'running' flag is set to False by main thread.""" + global s_img, s_probs, s_labels + + print('TrtGooglenetThread: loading the TRT Googlenet engine...') + self.net = PyTrtGooglenet(DEPLOY_ENGINE, ENGINE_SHAPE0, ENGINE_SHAPE1) + print('TrtGooglenetThread: start running...') + self.running = True + while self.running: + img = self.cam.read() + if img is None: + break + top_probs, top_labels = classify( + img, self.net, self.labels, self.do_cropping) + with self.condition: + s_img, s_probs, s_labels = img, top_probs, top_labels + self.condition.notify() + del self.net + print('TrtGooglenetThread: stopped...') + + def stop(self): + self.running = False + self.join() + + +def show_top_preds(img, top_probs, top_labels): + """Show top predicted classes and softmax scores.""" + x = 10 + y = 40 + for prob, label in 
zip(top_probs, top_labels): + pred = '{:.4f} {:20s}'.format(prob, label) + #cv2.putText(img, pred, (x+1, y), cv2.FONT_HERSHEY_PLAIN, 1.0, + # (32, 32, 32), 4, cv2.LINE_AA) + cv2.putText(img, pred, (x, y), cv2.FONT_HERSHEY_PLAIN, 1.0, + (0, 0, 240), 1, cv2.LINE_AA) + y += 20 + + +def loop_and_display(condition): + """Continuously capture images from camera and do classification.""" + global s_img, s_probs, s_labels + + full_scrn = False + fps = 0.0 + tic = time.time() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + break + with condition: + if condition.wait(timeout=MAIN_THREAD_TIMEOUT): + img, top_probs, top_labels = s_img, s_probs, s_labels + else: + raise SystemExit('ERROR: timeout waiting for img from child') + show_top_preds(img, top_probs, top_labels) + img = show_fps(img, fps) + cv2.imshow(WINDOW_NAME, img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05) + tic = toc + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('H') or key == ord('h'): # Toggle help message + show_help = not show_help + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_display(WINDOW_NAME, full_scrn) + + +def main(): + args = parse_args() + labels = np.loadtxt('googlenet/synset_words.txt', str, delimiter='\t') + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + open_window( + WINDOW_NAME, 'Camera TensorRT GoogLeNet Demo', + cam.img_width, cam.img_height) + condition = threading.Condition() + trt_thread = TrtGooglenetThread(condition, cam, labels, args.crop_center) + trt_thread.start() # start the child thread + loop_and_display(condition) + trt_thread.stop() # stop the child thread + + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/trt_modnet.py b/trt_modnet.py new file mode 100644 index 0000000..5e754b8 --- /dev/null +++ b/trt_modnet.py @@ -0,0 +1,170 @@ +"""trt_modnet.py + +This script demonstrates how to do real-time "image matting" with +TensorRT optimized MODNet engine. +""" + + +import argparse + +import numpy as np +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver + +from utils.camera import add_camera_args, Camera +from utils.writer import get_video_writer +from utils.background import Background +from utils.display import open_window, show_fps +from utils.display import FpsCalculator, ScreenToggler +from utils.modnet import TrtMODNet + + +WINDOW_NAME = 'TrtMODNetDemo' + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time image matting with TensorRT optimized MODNet') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument( + '--background', type=str, default='', + help='background image or video file name [None]') + parser.add_argument( + '--create_video', type=str, default='', + help='create output video (either .ts or .mp4) [None]') + parser.add_argument( + '--demo_mode', action='store_true', + help='run the program in a special "demo mode" [False]') + args = parser.parse_args() + return args + + +class BackgroundBlender(): + """BackgroundBlender + + # Arguments + demo_mode: if True, do foreground/background blending in a + special "demo mode" which alternates among the + original, replaced and black backgrounds. 
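+
+    Conceptually, blend() below does a per-pixel alpha composite (a
+    condensed sketch of the actual code, which also casts the result
+    back to uint8):
+
+        out = img * matte[..., None] + bg * (1.0 - matte[..., None])
+
+    where 'matte' is the foreground opacity predicted by MODNet, with
+    values between 0.0 (background) and 1.0 (foreground).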
+ """ + + def __init__(self, demo_mode=False): + self.demo_mode = demo_mode + self.count = 0 + + def blend(self, img, bg, matte): + """Blend foreground and background using the 'matte'. + + # Arguments + img: uint8 np.array of shape (H, W, 3), the foreground image + bg: uint8 np.array of shape (H, W, 3), the background image + matte: float32 np.array of shape (H, W), values between 0.0 and 1.0 + """ + if self.demo_mode: + img, bg, matte = self._mod_for_demo(img, bg, matte) + return (img * matte[..., np.newaxis] + + bg * (1 - matte[..., np.newaxis])).astype(np.uint8) + + def _mod_for_demo(self, img, bg, matte): + """Modify img, bg and matte for "demo mode" + + # Demo script (based on "count") + 0~ 59: black background left to right + 60~119: black background only + 120~179: replaced background left to right + 180~239: replaced background + 240~299: original background left to right + 300~359: original background + """ + img_h, img_w, _ = img.shape + if self.count < 120: + bg = np.zeros(bg.shape, dtype=np.uint8) + if self.count < 60: + offset = int(img_w * self.count / 59) + matte[:, offset:img_w] = 1.0 + elif self.count < 240: + if self.count < 180: + offset = int(img_w * (self.count - 120) / 59) + bg[:, offset:img_w, :] = 0 + else: + if self.count < 300: + offset = int(img_w * (self.count - 240) / 59) + matte[:, 0:offset] = 1.0 + else: + matte[:, :] = 1.0 + self.count = (self.count + 1) % 360 + return img, bg, matte + + +class TrtMODNetRunner(): + """TrtMODNetRunner + + # Arguments + modnet: TrtMODNet instance + cam: Camera object (for reading foreground images) + bggen: background generator (for reading background images) + blender: BackgroundBlender object + writer: VideoWriter object (for saving output video) + """ + + def __init__(self, modnet, cam, bggen, blender, writer=None): + self.modnet = modnet + self.cam = cam + self.bggen = bggen + self.blender = blender + self.writer = writer + open_window( + WINDOW_NAME, 'TensorRT MODNet Demo', cam.img_width, cam.img_height) + + def run(self): + """Get img and bg, infer matte, blend and show img, then repeat.""" + scrn_tog = ScreenToggler() + fps_calc = FpsCalculator() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: break + img, bg = self.cam.read(), self.bggen.read() + if img is None: break + matte = self.modnet.infer(img) + matted_img = self.blender.blend(img, bg, matte) + fps = fps_calc.update() + matted_img = show_fps(matted_img, fps) + if self.writer: self.writer.write(matted_img) + cv2.imshow(WINDOW_NAME, matted_img) + key = cv2.waitKey(1) + if key == ord('F') or key == ord('f'): # Toggle fullscreen + scrn_tog.toggle() + elif key == 27: # ESC key: quit + break + + def __del__(self): + cv2.destroyAllWindows() + + +def main(): + args = parse_args() + + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + writer = None + if args.create_video: + writer = get_video_writer( + args.create_video, cam.img_width, cam.img_height) + + modnet = TrtMODNet() + bggen = Background(args.background, cam.img_width, cam.img_height) + blender = BackgroundBlender(args.demo_mode) + + runner = TrtMODNetRunner(modnet, cam, bggen, blender, writer) + runner.run() + + if writer: + writer.release() + cam.release() + + +if __name__ == '__main__': + main() diff --git a/trt_mtcnn.py b/trt_mtcnn.py new file mode 100644 index 0000000..8a21069 --- /dev/null +++ b/trt_mtcnn.py @@ -0,0 +1,89 @@ +"""trt_mtcnn.py + +This script demonstrates how to do real-time face detection with +Cython wrapped TensorRT optimized 
MTCNN engine. +""" + +import time +import argparse + +import cv2 +from utils.camera import add_camera_args, Camera +from utils.display import open_window, set_display, show_fps +from utils.mtcnn import TrtMtcnn + + +WINDOW_NAME = 'TrtMtcnnDemo' +BBOX_COLOR = (0, 255, 0) # green + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time face detection with TrtMtcnn on Jetson ' + 'Nano') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument('--minsize', type=int, default=40, + help='minsize (in pixels) for detection [40]') + args = parser.parse_args() + return args + + +def show_faces(img, boxes, landmarks): + """Draw bounding boxes and face landmarks on image.""" + for bb, ll in zip(boxes, landmarks): + x1, y1, x2, y2 = int(bb[0]), int(bb[1]), int(bb[2]), int(bb[3]) + cv2.rectangle(img, (x1, y1), (x2, y2), BBOX_COLOR, 2) + for j in range(5): + cv2.circle(img, (int(ll[j]), int(ll[j+5])), 2, BBOX_COLOR, 2) + return img + + +def loop_and_detect(cam, mtcnn, minsize): + """Continuously capture images from camera and do face detection.""" + full_scrn = False + fps = 0.0 + tic = time.time() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + break + img = cam.read() + if img is not None: + dets, landmarks = mtcnn.detect(img, minsize=minsize) + print('{} face(s) found'.format(len(dets))) + img = show_faces(img, dets, landmarks) + img = show_fps(img, fps) + cv2.imshow(WINDOW_NAME, img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05) + tic = toc + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_display(WINDOW_NAME, full_scrn) + + +def main(): + args = parse_args() + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + mtcnn = TrtMtcnn() + + open_window( + WINDOW_NAME, 'Camera TensorRT MTCNN Demo for Jetson Nano', + cam.img_width, cam.img_height) + loop_and_detect(cam, mtcnn, args.minsize) + + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/trt_ssd.py b/trt_ssd.py new file mode 100644 index 0000000..20b6e46 --- /dev/null +++ b/trt_ssd.py @@ -0,0 +1,102 @@ +"""trt_ssd.py + +This script demonstrates how to do real-time object detection with +TensorRT optimized Single-Shot Multibox Detector (SSD) engine. 
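+
+The model is selected with the '-m' flag defined in parse_args() below
+(e.g. 'python3 trt_ssd.py -m ssd_mobilenet_v2_coco'), and the
+corresponding TensorRT engine is assumed to have been built beforehand
+with the scripts under ssd/ (ssd/build_engine.py or ssd/build_engines.sh).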
+""" + + +import time +import argparse + +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver + +from utils.ssd_classes import get_cls_dict +from utils.ssd import TrtSSD +from utils.camera import add_camera_args, Camera +from utils.display import open_window, set_display, show_fps +from utils.visualization import BBoxVisualization + + +WINDOW_NAME = 'TrtSsdDemo' +INPUT_HW = (300, 300) +SUPPORTED_MODELS = [ + 'ssd_mobilenet_v1_coco', + 'ssd_mobilenet_v1_egohands', + 'ssd_mobilenet_v2_coco', + 'ssd_mobilenet_v2_egohands', + 'ssd_inception_v2_coco', + 'ssdlite_mobilenet_v2_coco', +] + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time object detection with TensorRT optimized ' + 'SSD model on Jetson Nano') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument('-m', '--model', type=str, + default='ssd_mobilenet_v1_coco', + choices=SUPPORTED_MODELS) + args = parser.parse_args() + return args + + +def loop_and_detect(cam, trt_ssd, conf_th, vis): + """Continuously capture images from camera and do object detection. + + # Arguments + cam: the camera instance (video source). + trt_ssd: the TRT SSD object detector instance. + conf_th: confidence/score threshold for object detection. + vis: for visualization. + """ + full_scrn = False + fps = 0.0 + tic = time.time() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + break + img = cam.read() + if img is None: + break + boxes, confs, clss = trt_ssd.detect(img, conf_th) + img = vis.draw_bboxes(img, boxes, confs, clss) + img = show_fps(img, fps) + cv2.imshow(WINDOW_NAME, img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05) + tic = toc + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_display(WINDOW_NAME, full_scrn) + + +def main(): + args = parse_args() + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + cls_dict = get_cls_dict(args.model.split('_')[-1]) + trt_ssd = TrtSSD(args.model, INPUT_HW) + + open_window( + WINDOW_NAME, 'Camera TensorRT SSD Demo', + cam.img_width, cam.img_height) + vis = BBoxVisualization(cls_dict) + loop_and_detect(cam, trt_ssd, conf_th=0.3, vis=vis) + + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/trt_ssd_async.py b/trt_ssd_async.py new file mode 100644 index 0000000..35b1bed --- /dev/null +++ b/trt_ssd_async.py @@ -0,0 +1,185 @@ +"""trt_ssd_async.py + +This is the 'async' version of trt_ssd.py implementation. It creates +1 dedicated child thread for fetching camera input and do inferencing +with the TensorRT optimized SSD model/engine, while using the main +thread for drawing detection results and displaying video. Ideally, +the 2 threads work in a pipeline fashion so overall throughput (FPS) +would be improved comparing to the non-async version. 
+""" + + +import time +import argparse +import threading + +import cv2 +import pycuda.driver as cuda + +from utils.ssd_classes import get_cls_dict +from utils.ssd import TrtSSD +from utils.camera import add_camera_args, Camera +from utils.display import open_window, set_display, show_fps +from utils.visualization import BBoxVisualization + + +WINDOW_NAME = 'TrtSsdDemoAsync' +MAIN_THREAD_TIMEOUT = 20.0 # 20 seconds +INPUT_HW = (300, 300) +SUPPORTED_MODELS = [ + 'ssd_mobilenet_v1_coco', + 'ssd_mobilenet_v1_egohands', + 'ssd_mobilenet_v2_coco', + 'ssd_mobilenet_v2_egohands', + 'ssd_inception_v2_coco', + 'ssdlite_mobilenet_v2_coco', +] + +# These global variables are 'shared' between the main and child +# threads. The child thread writes new frame and detection result +# into these variables, while the main thread reads from them. +s_img, s_boxes, s_confs, s_clss = None, None, None, None + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time object detection with TensorRT optimized ' + 'SSD model on Jetson Nano') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument('-m', '--model', type=str, + default='ssd_mobilenet_v1_coco', + choices=SUPPORTED_MODELS) + args = parser.parse_args() + return args + + +class TrtThread(threading.Thread): + """TrtThread + + This implements the child thread which continues to read images + from cam (input) and to do TRT engine inferencing. The child + thread stores the input image and detection results into global + variables and uses a condition varaiable to inform main thread. + In other words, the TrtThread acts as the producer while the + main thread is the consumer. + """ + def __init__(self, condition, cam, model, conf_th): + """__init__ + + # Arguments + condition: the condition variable used to notify main + thread about new frame and detection result + cam: the camera object for reading input image frames + model: a string, specifying the TRT SSD model + conf_th: confidence threshold for detection + """ + threading.Thread.__init__(self) + self.condition = condition + self.cam = cam + self.model = model + self.conf_th = conf_th + self.cuda_ctx = None # to be created when run + self.trt_ssd = None # to be created when run + self.running = False + + def run(self): + """Run until 'running' flag is set to False by main thread. + + NOTE: CUDA context is created here, i.e. inside the thread + which calls CUDA kernels. In other words, creating CUDA + context in __init__() doesn't work. + """ + global s_img, s_boxes, s_confs, s_clss + + print('TrtThread: loading the TRT SSD engine...') + self.cuda_ctx = cuda.Device(0).make_context() # GPU 0 + self.trt_ssd = TrtSSD(self.model, INPUT_HW) + print('TrtThread: start running...') + self.running = True + while self.running: + img = self.cam.read() + if img is None: + break + boxes, confs, clss = self.trt_ssd.detect(img, self.conf_th) + with self.condition: + s_img, s_boxes, s_confs, s_clss = img, boxes, confs, clss + self.condition.notify() + del self.trt_ssd + self.cuda_ctx.pop() + del self.cuda_ctx + print('TrtThread: stopped...') + + def stop(self): + self.running = False + self.join() + + +def loop_and_display(condition, vis): + """Take detection results from the child thread and display. + + # Arguments + condition: the condition variable for synchronization with + the child thread. + vis: for visualization. 
+ """ + global s_img, s_boxes, s_confs, s_clss + + full_scrn = False + fps = 0.0 + tic = time.time() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + break + with condition: + # Wait for the next frame and detection result. When + # getting the signal from the child thread, save the + # references to the frame and detection result for + # display. + if condition.wait(timeout=MAIN_THREAD_TIMEOUT): + img, boxes, confs, clss = s_img, s_boxes, s_confs, s_clss + else: + raise SystemExit('ERROR: timeout waiting for img from child') + img = vis.draw_bboxes(img, boxes, confs, clss) + img = show_fps(img, fps) + cv2.imshow(WINDOW_NAME, img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05) + tic = toc + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_display(WINDOW_NAME, full_scrn) + + +def main(): + args = parse_args() + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + cuda.init() # init pycuda driver + + cls_dict = get_cls_dict(args.model.split('_')[-1]) + + open_window( + WINDOW_NAME, 'Camera TensorRT SSD Demo', + cam.img_width, cam.img_height) + vis = BBoxVisualization(cls_dict) + condition = threading.Condition() + trt_thread = TrtThread(condition, cam, args.model, conf_th=0.3) + trt_thread.start() # start the child thread + loop_and_display(condition, vis) + trt_thread.stop() # stop the child thread + + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/trt_yolo.py b/trt_yolo.py new file mode 100644 index 0000000..88cc4ac --- /dev/null +++ b/trt_yolo.py @@ -0,0 +1,111 @@ +"""trt_yolo.py + +This script demonstrates how to do real-time object detection with +TensorRT optimized YOLO engine. +""" + + +import os +import time +import argparse + +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver + +from utils.yolo_classes import get_cls_dict +from utils.camera import add_camera_args, Camera +from utils.display import open_window, set_display, show_fps +from utils.visualization import BBoxVisualization +from utils.yolo_with_plugins import TrtYOLO + + +WINDOW_NAME = 'TrtYOLODemo' + + +def parse_args(): + """Parse input arguments.""" + desc = ('Capture and display live camera video, while doing ' + 'real-time object detection with TensorRT optimized ' + 'YOLO model on Jetson') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument( + '-c', '--category_num', type=int, default=80, + help='number of object categories [80]') + parser.add_argument( + '-t', '--conf_thresh', type=float, default=0.3, + help='set the detection confidence threshold') + parser.add_argument( + '-m', '--model', type=str, required=True, + help=('[yolov3-tiny|yolov3|yolov3-spp|yolov4-tiny|yolov4|' + 'yolov4-csp|yolov4x-mish|yolov4-p5]-[{dimension}], where ' + '{dimension} could be either a single number (e.g. ' + '288, 416, 608) or 2 numbers, WxH (e.g. 416x256)')) + parser.add_argument( + '-l', '--letter_box', action='store_true', + help='inference with letterboxed image [False]') + args = parser.parse_args() + return args + + +def loop_and_detect(cam, trt_yolo, conf_th, vis): + """Continuously capture images from camera and do object detection. + + # Arguments + cam: the camera instance (video source). 
+ trt_yolo: the TRT YOLO object detector instance. + conf_th: confidence/score threshold for object detection. + vis: for visualization. + """ + full_scrn = False + fps = 0.0 + tic = time.time() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + break + img = cam.read() + if img is None: + break + boxes, confs, clss = trt_yolo.detect(img, conf_th) + img = vis.draw_bboxes(img, boxes, confs, clss) + img = show_fps(img, fps) + cv2.imshow(WINDOW_NAME, img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05) + tic = toc + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_display(WINDOW_NAME, full_scrn) + + +def main(): + args = parse_args() + if args.category_num <= 0: + raise SystemExit('ERROR: bad category_num (%d)!' % args.category_num) + if not os.path.isfile('yolo/%s.trt' % args.model): + raise SystemExit('ERROR: file (yolo/%s.trt) not found!' % args.model) + + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + cls_dict = get_cls_dict(args.category_num) + vis = BBoxVisualization(cls_dict) + trt_yolo = TrtYOLO(args.model, args.category_num, args.letter_box) + + open_window( + WINDOW_NAME, 'Camera TensorRT YOLO Demo', + cam.img_width, cam.img_height) + loop_and_detect(cam, trt_yolo, args.conf_thresh, vis=vis) + + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/trt_yolo_cv.py b/trt_yolo_cv.py new file mode 100644 index 0000000..de7cfba --- /dev/null +++ b/trt_yolo_cv.py @@ -0,0 +1,97 @@ +"""trt_yolo_cv.py + +This script could be used to make object detection video with +TensorRT optimized YOLO engine. + +"cv" means "create video" +made by BigJoon (ref. jkjung-avt) +""" + + +import os +import argparse + +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver + +from utils.yolo_classes import get_cls_dict +from utils.visualization import BBoxVisualization +from utils.yolo_with_plugins import TrtYOLO + + +def parse_args(): + """Parse input arguments.""" + desc = ('Run the TensorRT optimized object detecion model on an input ' + 'video and save BBoxed overlaid output as another video.') + parser = argparse.ArgumentParser(description=desc) + parser.add_argument( + '-v', '--video', type=str, required=True, + help='input video file name') + parser.add_argument( + '-o', '--output', type=str, required=True, + help='output video file name') + parser.add_argument( + '-c', '--category_num', type=int, default=80, + help='number of object categories [80]') + parser.add_argument( + '-m', '--model', type=str, required=True, + help=('[yolov3-tiny|yolov3|yolov3-spp|yolov4-tiny|yolov4|' + 'yolov4-csp|yolov4x-mish|yolov4-p5]-[{dimension}], where ' + '{dimension} could be either a single number (e.g. ' + '288, 416, 608) or 2 numbers, WxH (e.g. 416x256)')) + parser.add_argument( + '-l', '--letter_box', action='store_true', + help='inference with letterboxed image [False]') + args = parser.parse_args() + return args + + +def loop_and_detect(cap, trt_yolo, conf_th, vis, writer): + """Continuously capture images from camera and do object detection. + + # Arguments + cap: the camera instance (video source). + trt_yolo: the TRT YOLO object detector instance. + conf_th: confidence/score threshold for object detection. + vis: for visualization. 
+ writer: the VideoWriter object for the output video. + """ + + while True: + ret, frame = cap.read() + if frame is None: break + boxes, confs, clss = trt_yolo.detect(frame, conf_th) + frame = vis.draw_bboxes(frame, boxes, confs, clss) + writer.write(frame) + print('.', end='', flush=True) + + print('\nDone.') + + +def main(): + args = parse_args() + if args.category_num <= 0: + raise SystemExit('ERROR: bad category_num (%d)!' % args.category_num) + if not os.path.isfile('yolo/%s.trt' % args.model): + raise SystemExit('ERROR: file (yolo/%s.trt) not found!' % args.model) + + cap = cv2.VideoCapture(args.video) + if not cap.isOpened(): + raise SystemExit('ERROR: failed to open the input video file!') + frame_width, frame_height = int(cap.get(3)), int(cap.get(4)) + writer = cv2.VideoWriter( + args.output, + cv2.VideoWriter_fourcc(*'mp4v'), 30, (frame_width, frame_height)) + + cls_dict = get_cls_dict(args.category_num) + vis = BBoxVisualization(cls_dict) + trt_yolo = TrtYOLO(args.model, args.category_num, args.letter_box) + + loop_and_detect(cap, trt_yolo, conf_th=0.3, vis=vis, writer=writer) + + writer.release() + cap.release() + + +if __name__ == '__main__': + main() diff --git a/trt_yolo_mjpeg.py b/trt_yolo_mjpeg.py new file mode 100644 index 0000000..4278a5c --- /dev/null +++ b/trt_yolo_mjpeg.py @@ -0,0 +1,101 @@ +"""trt_yolo_mjpeg.py + +MJPEG version of trt_yolo.py. +""" + + +import os +import time +import argparse + +import cv2 +import pycuda.autoinit # This is needed for initializing CUDA driver + +from utils.yolo_classes import get_cls_dict +from utils.camera import add_camera_args, Camera +from utils.display import show_fps +from utils.visualization import BBoxVisualization +from utils.mjpeg import MjpegServer +from utils.yolo_with_plugins import TrtYOLO + + +def parse_args(): + """Parse input arguments.""" + desc = 'MJPEG version of trt_yolo' + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument( + '-c', '--category_num', type=int, default=80, + help='number of object categories [80]') + parser.add_argument( + '-m', '--model', type=str, required=True, + help=('[yolov3-tiny|yolov3|yolov3-spp|yolov4-tiny|yolov4|' + 'yolov4-csp|yolov4x-mish|yolov4-p5]-[{dimension}], where ' + '{dimension} could be either a single number (e.g. ' + '288, 416, 608) or 2 numbers, WxH (e.g. 416x256)')) + parser.add_argument( + '-l', '--letter_box', action='store_true', + help='inference with letterboxed image [False]') + parser.add_argument( + '-p', '--mjpeg_port', type=int, default=8080, + help='MJPEG server port [8080]') + args = parser.parse_args() + return args + + +def loop_and_detect(cam, trt_yolo, conf_th, vis, mjpeg_server): + """Continuously capture images from camera and do object detection. + + # Arguments + cam: the camera instance (video source). + trt_yolo: the TRT YOLO object detector instance. + conf_th: confidence/score threshold for object detection. + vis: for visualization. 
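# A minimal sketch of the same read -> detect -> write pipeline that
# trt_yolo_cv.py implements, with the magic numbers cap.get(3)/cap.get(4)
# spelled out as CAP_PROP_FRAME_WIDTH/HEIGHT. 'input.mp4', 'output.mp4' and
# the 'yolov4-416' engine name are placeholders.
import cv2
import pycuda.autoinit  # noqa: F401

from utils.yolo_classes import get_cls_dict
from utils.visualization import BBoxVisualization
from utils.yolo_with_plugins import TrtYOLO

cap = cv2.VideoCapture('input.mp4')
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0    # fall back to 30 if unknown
writer = cv2.VideoWriter(
    'output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

trt_yolo = TrtYOLO('yolov4-416', 80, False)
vis = BBoxVisualization(get_cls_dict(80))
while True:
    ret, frame = cap.read()
    if frame is None:
        break
    boxes, confs, clss = trt_yolo.detect(frame, 0.3)
    writer.write(vis.draw_bboxes(frame, boxes, confs, clss))
writer.release()
cap.release()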
+ mjpeg_server + """ + fps = 0.0 + tic = time.time() + while True: + img = cam.read() + if img is None: + break + boxes, confs, clss = trt_yolo.detect(img, conf_th) + img = vis.draw_bboxes(img, boxes, confs, clss) + img = show_fps(img, fps) + mjpeg_server.send_img(img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05) + tic = toc + + +def main(): + args = parse_args() + if args.category_num <= 0: + raise SystemExit('ERROR: bad category_num (%d)!' % args.category_num) + if not os.path.isfile('yolo/%s.trt' % args.model): + raise SystemExit('ERROR: file (yolo/%s.trt) not found!' % args.model) + + cam = Camera(args) + if not cam.isOpened(): + raise SystemExit('ERROR: failed to open camera!') + + cls_dict = get_cls_dict(args.category_num) + vis = BBoxVisualization(cls_dict) + trt_yolo = TrtYOLO(args.model, args.category_num, args.letter_box) + + mjpeg_server = MjpegServer(port=args.mjpeg_port) + print('MJPEG server started...') + try: + loop_and_detect(cam, trt_yolo, conf_th=0.3, vis=vis, + mjpeg_server=mjpeg_server) + except Exception as e: + print(e) + finally: + mjpeg_server.shutdown() + cam.release() + + +if __name__ == '__main__': + main() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/background.py b/utils/background.py new file mode 100644 index 0000000..0179bff --- /dev/null +++ b/utils/background.py @@ -0,0 +1,65 @@ +"""background.py + +This code implements the Background class for the TensorRT MODNet +demo. The Background class could generate background images from +either a still image, a video file or nothing (pure black bg). +""" + + +import numpy as np +import cv2 + + +class Background(): + """Backgrounf class which supports one of the following sources: + + 1. Image (jpg, png, etc.) file, repeating indefinitely + 2. Video file, looping forever + 3. 
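# A minimal sketch of serving frames over HTTP with utils.mjpeg.MjpegServer,
# independent of the YOLO detector: synthetic frames are pushed at ~10 Hz and
# can be viewed in a browser at http://<host>:8080/. The port and the frame
# contents are arbitrary choices for illustration.
import time
import numpy as np
from utils.mjpeg import MjpegServer

server = MjpegServer(port=8080)
try:
    for i in range(100):
        frame = np.full((480, 640, 3), i % 256, dtype=np.uint8)  # gray ramp
        server.send_img(frame)
        time.sleep(0.1)
finally:
    server.shutdown()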
None -> black background + + # Arguments + src: if not spcified, use black background; else, src should be + a filename of an image (jpg/png) or video (mp4/ts) + width & height: width & height of the output background image + """ + + def __init__(self, src, width, height, demo_mode=False): + self.src = src + self.width = width + self.height = height + self.demo_mode = demo_mode + if not src: # empty source: black background + self.is_video = False + self.bg_frame = np.zeros((height, width, 3), dtype=np.uint8) + elif not isinstance(src, str): + raise ValueError('bad src') + elif src.endswith('.jpg') or src.endswith('.png'): + self.is_video = False + self.bg_frame = cv2.resize(cv2.imread(src), (width, height)) + assert self.bg_frame is not None and self.bg_frame.ndim == 3 + elif src.endswith('.mp4') or src.endswith('.ts'): + self.is_video = True + self.cap = cv2.VideoCapture(src) + assert self.cap.isOpened() + else: + raise ValueError('unknown src') + + def read(self): + """Read a frame from the Background object.""" + if self.is_video: + _, frame = self.cap.read() + if frame is None: + # assume end of video file has been reached, so loop around + self.cap.release() + self.cap = cv2.VideoCapture(self.src) + _, frame = self.cap.read() + return cv2.resize(frame, (self.width, self.height)) + else: + return self.bg_frame.copy() + + def __del__(self): + if self.is_video: + try: + self.cap.release() + except: + pass diff --git a/utils/camera.py b/utils/camera.py new file mode 100644 index 0000000..b5a362d --- /dev/null +++ b/utils/camera.py @@ -0,0 +1,273 @@ +"""camera.py + +This code implements the Camera class, which encapsulates code to +handle IP CAM, USB webcam or the Jetson onboard camera. In +addition, this Camera class is further extended to take a video +file or an image file as input. +""" + + +import logging +import threading +import subprocess + +import numpy as np +import cv2 + + +# The following flag ise used to control whether to use a GStreamer +# pipeline to open USB webcam source. If set to False, we just open +# the webcam using cv2.VideoCapture(index) machinery. i.e. relying +# on cv2's built-in function to capture images from the webcam. +USB_GSTREAMER = True + + +def add_camera_args(parser): + """Add parser augument for camera options.""" + parser.add_argument('--image', type=str, default=None, + help='image file name, e.g. dog.jpg') + parser.add_argument('--video', type=str, default=None, + help='video file name, e.g. traffic.mp4') + parser.add_argument('--video_looping', action='store_true', + help='loop around the video file [False]') + parser.add_argument('--rtsp', type=str, default=None, + help=('RTSP H.264 stream, e.g. ' + 'rtsp://admin:123456@192.168.1.64:554')) + parser.add_argument('--rtsp_latency', type=int, default=200, + help='RTSP latency in ms [200]') + parser.add_argument('--usb', type=int, default=None, + help='USB webcam device id (/dev/video?) 
[None]') + parser.add_argument('--gstr', type=str, default=None, + help='GStreamer string [None]') + parser.add_argument('--onboard', type=int, default=None, + help='Jetson onboard camera [None]') + parser.add_argument('--copy_frame', action='store_true', + help=('copy video frame internally [False]')) + parser.add_argument('--do_resize', action='store_true', + help=('resize image/video [False]')) + parser.add_argument('--width', type=int, default=640, + help='image width [640]') + parser.add_argument('--height', type=int, default=480, + help='image height [480]') + return parser + + +def open_cam_rtsp(uri, width, height, latency): + """Open an RTSP URI (IP CAM).""" + gst_elements = str(subprocess.check_output('gst-inspect-1.0')) + if 'omxh264dec' in gst_elements: + # Use hardware H.264 decoder on Jetson platforms + gst_str = ('rtspsrc location={} latency={} ! ' + 'rtph264depay ! h264parse ! omxh264dec ! ' + 'nvvidconv ! ' + 'video/x-raw, width=(int){}, height=(int){}, ' + 'format=(string)BGRx ! videoconvert ! ' + 'appsink').format(uri, latency, width, height) + elif 'avdec_h264' in gst_elements: + # Otherwise try to use the software decoder 'avdec_h264' + # NOTE: in case resizing images is necessary, try adding + # a 'videoscale' into the pipeline + gst_str = ('rtspsrc location={} latency={} ! ' + 'rtph264depay ! h264parse ! avdec_h264 ! ' + 'videoconvert ! appsink').format(uri, latency) + else: + raise RuntimeError('H.264 decoder not found!') + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + + +def open_cam_usb(dev, width, height): + """Open a USB webcam.""" + if USB_GSTREAMER: + gst_str = ('v4l2src device=/dev/video{} ! ' + 'video/x-raw, width=(int){}, height=(int){} ! ' + 'videoconvert ! appsink').format(dev, width, height) + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + else: + return cv2.VideoCapture(dev) + + +def open_cam_gstr(gstr, width, height): + """Open camera using a GStreamer string. + + Example: + gstr = 'v4l2src device=/dev/video0 ! video/x-raw, width=(int){width}, height=(int){height} ! videoconvert ! appsink' + """ + gst_str = gstr.format(width=width, height=height) + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + + +def open_cam_onboard(width, height): + """Open the Jetson onboard camera.""" + gst_elements = str(subprocess.check_output('gst-inspect-1.0')) + if 'nvcamerasrc' in gst_elements: + # On versions of L4T prior to 28.1, you might need to add + # 'flip-method=2' into gst_str below. + gst_str = ('nvcamerasrc ! ' + 'video/x-raw(memory:NVMM), ' + 'width=(int)2592, height=(int)1458, ' + 'format=(string)I420, framerate=(fraction)30/1 ! ' + 'nvvidconv ! ' + 'video/x-raw, width=(int){}, height=(int){}, ' + 'format=(string)BGRx ! ' + 'videoconvert ! appsink').format(width, height) + elif 'nvarguscamerasrc' in gst_elements: + gst_str = ('nvarguscamerasrc ! ' + 'video/x-raw(memory:NVMM), ' + 'width=(int)1920, height=(int)1080, ' + 'format=(string)NV12, framerate=(fraction)30/1 ! ' + 'nvvidconv flip-method=2 ! ' + 'video/x-raw, width=(int){}, height=(int){}, ' + 'format=(string)BGRx ! ' + 'videoconvert ! appsink').format(width, height) + else: + raise RuntimeError('onboard camera source not found!') + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + + +def grab_img(cam): + """This 'grab_img' function is designed to be run in the sub-thread. + Once started, this thread continues to grab a new image and put it + into the global 'img_handle', until 'thread_running' is set to False. 
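# A minimal sketch of opening a USB webcam through the same kind of GStreamer
# pipeline that open_cam_usb() builds above. It assumes OpenCV was built with
# GStreamer support and that /dev/video0 exists; otherwise it falls back to
# plain cv2.VideoCapture(0).
import cv2

gst_str = ('v4l2src device=/dev/video0 ! '
           'video/x-raw, width=(int)640, height=(int)480 ! '
           'videoconvert ! appsink')
cap = cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER)
if not cap.isOpened():
    cap = cv2.VideoCapture(0)          # software fallback
ret, frame = cap.read()
print('got frame:', ret, None if frame is None else frame.shape)
cap.release()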
+ """ + while cam.thread_running: + _, cam.img_handle = cam.cap.read() + if cam.img_handle is None: + #logging.warning('Camera: cap.read() returns None...') + break + cam.thread_running = False + + +class Camera(): + """Camera class which supports reading images from theses video sources: + + 1. Image (jpg, png, etc.) file, repeating indefinitely + 2. Video file + 3. RTSP (IP CAM) + 4. USB webcam + 5. Jetson onboard camera + """ + + def __init__(self, args): + self.args = args + self.is_opened = False + self.video_file = '' + self.video_looping = args.video_looping + self.thread_running = False + self.img_handle = None + self.copy_frame = args.copy_frame + self.do_resize = args.do_resize + self.img_width = args.width + self.img_height = args.height + self.cap = None + self.thread = None + self._open() # try to open the camera + + def _open(self): + """Open camera based on command line arguments.""" + if self.cap is not None: + raise RuntimeError('camera is already opened!') + a = self.args + if a.image: + logging.info('Camera: using a image file %s' % a.image) + self.cap = 'image' + self.img_handle = cv2.imread(a.image) + if self.img_handle is not None: + if self.do_resize: + self.img_handle = cv2.resize( + self.img_handle, (a.width, a.height)) + self.is_opened = True + self.img_height, self.img_width, _ = self.img_handle.shape + elif a.video: + logging.info('Camera: using a video file %s' % a.video) + self.video_file = a.video + self.cap = cv2.VideoCapture(a.video) + self._start() + elif a.rtsp: + logging.info('Camera: using RTSP stream %s' % a.rtsp) + self.cap = open_cam_rtsp(a.rtsp, a.width, a.height, a.rtsp_latency) + self._start() + elif a.usb is not None: + logging.info('Camera: using USB webcam /dev/video%d' % a.usb) + self.cap = open_cam_usb(a.usb, a.width, a.height) + self._start() + elif a.gstr is not None: + logging.info('Camera: using GStreamer string "%s"' % a.gstr) + self.cap = open_cam_gstr(a.gstr, a.width, a.height) + self._start() + elif a.onboard is not None: + logging.info('Camera: using Jetson onboard camera') + self.cap = open_cam_onboard(a.width, a.height) + self._start() + else: + raise RuntimeError('no camera type specified!') + + def isOpened(self): + return self.is_opened + + def _start(self): + if not self.cap.isOpened(): + logging.warning('Camera: starting while cap is not opened!') + return + + # Try to grab the 1st image and determine width and height + _, self.img_handle = self.cap.read() + if self.img_handle is None: + logging.warning('Camera: cap.read() returns no image!') + self.is_opened = False + return + + self.is_opened = True + if self.video_file: + if not self.do_resize: + self.img_height, self.img_width, _ = self.img_handle.shape + else: + self.img_height, self.img_width, _ = self.img_handle.shape + # start the child thread if not using a video file source + # i.e. rtsp, usb or onboard + assert not self.thread_running + self.thread_running = True + self.thread = threading.Thread(target=grab_img, args=(self,)) + self.thread.start() + + def _stop(self): + if self.thread_running: + self.thread_running = False + #self.thread.join() + + def read(self): + """Read a frame from the camera object. + + Returns None if the camera runs out of image or error. 
+ """ + if not self.is_opened: + return None + + if self.video_file: + _, img = self.cap.read() + if img is None: + logging.info('Camera: reaching end of video file') + if self.video_looping: + self.cap.release() + self.cap = cv2.VideoCapture(self.video_file) + _, img = self.cap.read() + if img is not None and self.do_resize: + img = cv2.resize(img, (self.img_width, self.img_height)) + return img + elif self.cap == 'image': + return np.copy(self.img_handle) + else: + if self.copy_frame: + return self.img_handle.copy() + else: + return self.img_handle + + def release(self): + self._stop() + try: + self.cap.release() + except: + pass + self.is_opened = False + + def __del__(self): + self.release() diff --git a/utils/display.py b/utils/display.py new file mode 100644 index 0000000..973ddc7 --- /dev/null +++ b/utils/display.py @@ -0,0 +1,76 @@ +"""display.py +""" + + +import time + +import cv2 + + +def open_window(window_name, title, width=None, height=None): + """Open the display window.""" + cv2.namedWindow(window_name, cv2.WINDOW_NORMAL) + cv2.setWindowTitle(window_name, title) + if width and height: + cv2.resizeWindow(window_name, width, height) + + +def show_help_text(img, help_text): + """Draw help text on image.""" + cv2.putText(img, help_text, (11, 20), cv2.FONT_HERSHEY_PLAIN, 1.0, + (32, 32, 32), 4, cv2.LINE_AA) + cv2.putText(img, help_text, (10, 20), cv2.FONT_HERSHEY_PLAIN, 1.0, + (240, 240, 240), 1, cv2.LINE_AA) + return img + + +def show_fps(img, fps): + """Draw fps number at top-left corner of the image.""" + font = cv2.FONT_HERSHEY_PLAIN + line = cv2.LINE_AA + fps_text = 'FPS: {:.2f}'.format(fps) + cv2.putText(img, fps_text, (11, 20), font, 1.0, (32, 32, 32), 4, line) + cv2.putText(img, fps_text, (10, 20), font, 1.0, (240, 240, 240), 1, line) + return img + + +def set_display(window_name, full_scrn): + """Set disply window to either full screen or normal.""" + if full_scrn: + cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, + cv2.WINDOW_FULLSCREEN) + else: + cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN, + cv2.WINDOW_NORMAL) + + +class FpsCalculator(): + """Helper class for calculating frames-per-second (FPS).""" + + def __init__(self, decay_factor=0.95): + self.fps = 0.0 + self.tic = time.time() + self.decay_factor = decay_factor + + def update(self): + toc = time.time() + curr_fps = 1.0 / (toc - self.tic) + self.fps = curr_fps if self.fps == 0.0 else self.fps + self.fps = self.fps * self.decay_factor + \ + curr_fps * (1 - self.decay_factor) + self.tic = toc + return self.fps + + def reset(self): + self.fps = 0.0 + + +class ScreenToggler(): + """Helper class for toggling between non-fullscreen and fullscreen.""" + + def __init__(self): + self.full_scrn = False + + def toggle(self): + self.full_scrn = not self.full_scrn + set_display(WINDOW_NAME, self.full_scrn) diff --git a/utils/mjpeg.py b/utils/mjpeg.py new file mode 100644 index 0000000..264bde5 --- /dev/null +++ b/utils/mjpeg.py @@ -0,0 +1,107 @@ +"""mjpeg.py + +This module implements a simple MJPEG server which handles HTTP +requests from remote clients. 
+""" + + +import time +import queue +import threading +import socket +from http.server import BaseHTTPRequestHandler, HTTPServer +from socketserver import ThreadingMixIn + +import numpy as np +import cv2 + + +# globals +_MJPEG_QUEUE = queue.Queue(maxsize=2) +_SLEEP_INTERVAL = 0.1 # update JPG roughly every 0.1 second + + +class MjpegHandler(BaseHTTPRequestHandler): + """A simple MJPEG handler which publishes images.""" + + def _handle_mjpeg(self): + global _MJPEG_QUEUE + img = _MJPEG_QUEUE.get() + + self.send_response(200) + self.send_header( + 'Content-type', + 'multipart/x-mixed-replace; boundary=--jpgboundary' + ) + self.end_headers() + + while True: + if not _MJPEG_QUEUE.empty(): + img = _MJPEG_QUEUE.get() + ret, jpg = cv2.imencode('.jpg', img) + assert jpg is not None + self.wfile.write("--jpgboundary".encode("utf-8")) + self.send_header('Content-type', 'image/jpeg') + self.send_header('Content-length', str(jpg.size)) + self.end_headers() + self.wfile.write(jpg.tostring()) + time.sleep(_SLEEP_INTERVAL) + + def _handle_error(self): + self.send_response(404) + self.send_header('Content-type', 'text/html') + self.end_headers() + self.wfile.write('') + self.wfile.write('

<h1>{0!s} not found</h1>
'.format(self.path)) + self.wfile.write('') + + def do_GET(self): + if self.path == '/mjpg' or self.path == '/': + self._handle_mjpeg() + else: + #print('ERROR: ', self.path) + self._handle_error() + + def handle(self): + try: + super().handle() + except socket.error: + # ignore BrokenPipeError, which is caused by the client + # terminating the HTTP connection + pass + + +class ThreadedHTTPServer(ThreadingMixIn, HTTPServer): + """Handle HTTP requests in a separate thread.""" + # not used... + + +def run_server(server): + server.serve_forever() # this exits when server.shutdown() is called + server.socket.shutdown(socket.SHUT_RDWR) + server.socket.close() + + +class MjpegServer(object): + def __init__(self, init_img=None, ip='', port=8080): + # initialize the queue with a dummy image + global _MJPEG_QUEUE + init_img = init_img if init_img else \ + np.ones((480, 640, 3), np.uint8) * 255 # all white + _MJPEG_QUEUE.put(init_img) + # create the HTTP server and run it from the child thread + self.server = HTTPServer((ip, port), MjpegHandler) + self.run_thread = threading.Thread( + target=run_server, args=(self.server,)) + self.run_thread.start() + + def send_img(self, img): + global _MJPEG_QUEUE + try: + _MJPEG_QUEUE.put(img, block=False) + except queue.Full: + pass + + def shutdown(self): + self.server.shutdown() + del self.server diff --git a/utils/modnet.py b/utils/modnet.py new file mode 100644 index 0000000..83a75bc --- /dev/null +++ b/utils/modnet.py @@ -0,0 +1,164 @@ +"""modnet.py + +Implementation of TrtMODNet class. +""" + + +import numpy as np +import cv2 +import tensorrt as trt +import pycuda.driver as cuda + + +# Code in this module is only for TensorRT 7+ +if trt.__version__[0] < '7': + raise SystemExit('TensorRT version < 7') + + +def _preprocess_modnet(img, input_shape): + """Preprocess an image before TRT MODNet inferencing. + + # Args + img: int8 numpy array of shape (img_h, img_w, 3) + input_shape: a tuple of (H, W) + + # Returns + preprocessed img: float32 numpy array of shape (3, H, W) + """ + img = cv2.resize(img, (input_shape[1], input_shape[0]), cv2.INTER_AREA) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = img.transpose((2, 0, 1)).astype(np.float32) + img = (img - 127.5) / 127.5 + return img + + +def _postprocess_modnet(output, output_shape): + """Postprocess TRT MODNet output. + + # Args + output: inferenced output by the TensorRT engine + output_shape: (H, W), e.g. 
(480, 640) + """ + matte = cv2.resize( + output, (output_shape[1], output_shape[0]), + interpolation=cv2.INTER_AREA) + return matte + + +class HostDeviceMem(object): + """Simple helper data class that's a little nicer to use than a 2-tuple.""" + def __init__(self, host_mem, device_mem): + self.host = host_mem + self.device = device_mem + + def __str__(self): + return 'Host:\n' + str(self.host) + '\nDevice:\n' + str(self.device) + + def __repr__(self): + return self.__str__() + + +def allocate_buffers(engine, context): + """Allocates all host/device in/out buffers required for an engine.""" + assert len(engine) == 2 and engine[0] == 'input' and engine[1] == 'output' + dtype = trt.nptype(engine.get_binding_dtype('input')) + assert trt.nptype(engine.get_binding_dtype('output')) == dtype + bindings = [] + + dims_in = context.get_binding_shape(0) + assert len(dims_in) == 4 and dims_in[0] == 1 and dims_in[1] == 3 + hmem_in = cuda.pagelocked_empty(trt.volume(dims_in), dtype) + dmem_in = cuda.mem_alloc(hmem_in.nbytes) + bindings.append(int(dmem_in)) + inputs = [HostDeviceMem(hmem_in, dmem_in)] + + dims_out = context.get_binding_shape(1) + assert len(dims_out) == 4 and dims_out[0] == 1 and dims_out[1] == 1 + assert dims_out[2] == dims_in[2] and dims_out[3] == dims_in[3] + hmem_out = cuda.pagelocked_empty(trt.volume(dims_out), dtype) + dmem_out = cuda.mem_alloc(hmem_out.nbytes) + bindings.append(int(dmem_out)) + outputs = [HostDeviceMem(hmem_out, dmem_out)] + + return bindings, inputs, outputs + + +def do_inference_v2(context, bindings, inputs, outputs, stream): + """do_inference_v2 (for TensorRT 7.0+) + + This function is generalized for multiple inputs/outputs for full + dimension networks. Inputs and outputs are expected to be lists + of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. + return [out.host for out in outputs] + + +class TrtMODNet(object): + """TrtMODNet class encapsulates things needed to run TRT MODNet.""" + + def __init__(self, cuda_ctx=None): + """Initialize TensorRT plugins, engine and conetxt. 
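# A minimal sketch of how a matte like the one returned by
# _postprocess_modnet() is typically consumed: alpha-blending the camera
# frame over a replacement background. This helper is illustrative only;
# 'img' and 'bg' are assumed to be same-sized uint8 BGR frames and 'matte'
# a float array in [0, 1] (scale by 1/255 first if it arrives as 0/255).
import numpy as np

def composite(img, bg, matte):
    """Blend foreground img over bg using matte as per-pixel alpha."""
    alpha = matte.astype(np.float32)[..., np.newaxis]          # (H, W, 1)
    out = alpha * img.astype(np.float32) + (1.0 - alpha) * bg.astype(np.float32)
    return out.astype(np.uint8)

# toy usage with synthetic frames
img = np.full((480, 640, 3), 200, dtype=np.uint8)
bg = np.zeros((480, 640, 3), dtype=np.uint8)
matte = np.ones((480, 640), dtype=np.float32)
print(composite(img, bg, matte).shape)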
+ + # Arguments + cuda_ctx: PyCUDA context for inferencing (usually only needed + in multi-threaded cases + """ + self.cuda_ctx = cuda_ctx + if self.cuda_ctx: + self.cuda_ctx.push() + self.trt_logger = trt.Logger(trt.Logger.INFO) + self.engine = self._load_engine() + assert self.engine.get_binding_dtype('input') == trt.tensorrt.DataType.FLOAT + + try: + self.context = self.engine.create_execution_context() + self.output_shape = self.context.get_binding_shape(1) # (1, 1, 480, 640) + self.stream = cuda.Stream() + self.bindings, self.inputs, self.outputs = allocate_buffers( + self.engine, self.context) + except Exception as e: + raise RuntimeError('fail to allocate CUDA resources') from e + finally: + if self.cuda_ctx: + self.cuda_ctx.pop() + dims = self.context.get_binding_shape(0) # 'input' + self.input_shape = (dims[2], dims[3]) + + def _load_engine(self): + if not trt.init_libnvinfer_plugins(self.trt_logger, ''): + raise RuntimeError('fail to init built-in plugins') + engine_path = 'modnet/modnet.engine' + with open(engine_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def infer(self, img): + """Infer an image. + + The output is a matte (matting mask), which is a grayscale image + with either 0 or 255 pixels. + """ + img_resized = _preprocess_modnet(img, self.input_shape) + + self.inputs[0].host = np.ascontiguousarray(img_resized) + if self.cuda_ctx: + self.cuda_ctx.push() + trt_outputs = do_inference_v2( + context=self.context, + bindings=self.bindings, + inputs=self.inputs, + outputs=self.outputs, + stream=self.stream) + if self.cuda_ctx: + self.cuda_ctx.pop() + + output = trt_outputs[0].reshape(self.output_shape[-2:]) + return _postprocess_modnet(output, img.shape[:2]) diff --git a/utils/mtcnn.py b/utils/mtcnn.py new file mode 100644 index 0000000..dafeb70 --- /dev/null +++ b/utils/mtcnn.py @@ -0,0 +1,480 @@ +"""mtcnn_trt.py +""" + +import numpy as np +import cv2 +import pytrt + + +PIXEL_MEAN = 127.5 +PIXEL_SCALE = 0.0078125 + + +def convert_to_1x1(boxes): + """Convert detection boxes to 1:1 sizes + + # Arguments + boxes: numpy array, shape (n,5), dtype=float32 + + # Returns + boxes_1x1 + """ + boxes_1x1 = boxes.copy() + hh = boxes[:, 3] - boxes[:, 1] + 1. + ww = boxes[:, 2] - boxes[:, 0] + 1. + mm = np.maximum(hh, ww) + boxes_1x1[:, 0] = boxes[:, 0] + ww * 0.5 - mm * 0.5 + boxes_1x1[:, 1] = boxes[:, 1] + hh * 0.5 - mm * 0.5 + boxes_1x1[:, 2] = boxes_1x1[:, 0] + mm - 1. + boxes_1x1[:, 3] = boxes_1x1[:, 1] + mm - 1. 
+ boxes_1x1[:, 0:4] = np.fix(boxes_1x1[:, 0:4]) + return boxes_1x1 + + +def crop_img_with_padding(img, box, padding=0): + """Crop a box from image, with out-of-boundary pixels padded + + # Arguments + img: img as a numpy array, shape (H, W, 3) + box: numpy array, shape (5,) or (4,) + padding: integer value for padded pixels + + # Returns + cropped_im: cropped image as a numpy array, shape (H, W, 3) + """ + img_h, img_w, _ = img.shape + if box.shape[0] == 5: + cx1, cy1, cx2, cy2, _ = box.astype(int) + elif box.shape[0] == 4: + cx1, cy1, cx2, cy2 = box.astype(int) + else: + raise ValueError + cw = cx2 - cx1 + 1 + ch = cy2 - cy1 + 1 + cropped_im = np.zeros((ch, cw, 3), dtype=np.uint8) + padding + ex1 = max(0, -cx1) # ex/ey's are the destination coordinates + ey1 = max(0, -cy1) + ex2 = min(cw, img_w - cx1) + ey2 = min(ch, img_h - cy1) + fx1 = max(cx1, 0) # fx/fy's are the source coordinates + fy1 = max(cy1, 0) + fx2 = min(cx2+1, img_w) + fy2 = min(cy2+1, img_h) + cropped_im[ey1:ey2, ex1:ex2, :] = img[fy1:fy2, fx1:fx2, :] + return cropped_im + + +def nms(boxes, threshold, type='Union'): + """Non-Maximum Supression + + # Arguments + boxes: numpy array [:, 0:5] of [x1, y1, x2, y2, score]'s + threshold: confidence/score threshold, e.g. 0.5 + type: 'Union' or 'Min' + + # Returns + A list of indices indicating the result of NMS + """ + if boxes.shape[0] == 0: + return [] + xx1, yy1, xx2, yy2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] + areas = np.multiply(xx2-xx1+1, yy2-yy1+1) + sorted_idx = boxes[:, 4].argsort() + + pick = [] + while len(sorted_idx) > 0: + # In each loop, pick the last box (highest score) and remove + # all other boxes with IoU over threshold + tx1 = np.maximum(xx1[sorted_idx[-1]], xx1[sorted_idx[0:-1]]) + ty1 = np.maximum(yy1[sorted_idx[-1]], yy1[sorted_idx[0:-1]]) + tx2 = np.minimum(xx2[sorted_idx[-1]], xx2[sorted_idx[0:-1]]) + ty2 = np.minimum(yy2[sorted_idx[-1]], yy2[sorted_idx[0:-1]]) + tw = np.maximum(0.0, tx2 - tx1 + 1) + th = np.maximum(0.0, ty2 - ty1 + 1) + inter = tw * th + if type == 'Min': + iou = inter / \ + np.minimum(areas[sorted_idx[-1]], areas[sorted_idx[0:-1]]) + else: + iou = inter / \ + (areas[sorted_idx[-1]] + areas[sorted_idx[0:-1]] - inter) + pick.append(sorted_idx[-1]) + sorted_idx = sorted_idx[np.where(iou <= threshold)[0]] + return pick + + +def generate_pnet_bboxes(conf, reg, scale, t): + """ + # Arguments + conf: softmax score (face or not) of each grid + reg: regression values of x1, y1, x2, y2 coordinates. + The values are normalized to grid width (12) and + height (12). + scale: scale-down factor with respect to original image + t: confidence threshold + + # Returns + A numpy array of bounding box coordinates and the + cooresponding scores: [[x1, y1, x2, y2, score], ...] + + # Notes + Top left corner coordinates of each grid is (x*2, y*2), + or (x*2/scale, y*2/scale) in the original image. + Bottom right corner coordinates is (x*2+12-1, y*2+12-1), + or ((x*2+12-1)/scale, (y*2+12-1)/scale) in the original + image. + """ + conf = conf.T # swap H and W dimensions + dx1 = reg[0, :, :].T + dy1 = reg[1, :, :].T + dx2 = reg[2, :, :].T + dy2 = reg[3, :, :].T + (x, y) = np.where(conf >= t) + if len(x) == 0: + return np.zeros((0, 5), np.float32) + + score = np.array(conf[x, y]).reshape(-1, 1) # Nx1 + reg = np.array([dx1[x, y], dy1[x, y], + dx2[x, y], dy2[x, y]]).T * 12. # Nx4 + topleft = np.array([x, y], dtype=np.float32).T * 2. 
# Nx2 + bottomright = topleft + np.array([11., 11.], dtype=np.float32) # Nx2 + boxes = (np.concatenate((topleft, bottomright), axis=1) + reg) / scale + boxes = np.concatenate((boxes, score), axis=1) # Nx5 + # filter bboxes which are too small + #boxes = boxes[boxes[:, 2]-boxes[:, 0] >= 12., :] + #boxes = boxes[boxes[:, 3]-boxes[:, 1] >= 12., :] + return boxes + + +def generate_rnet_bboxes(conf, reg, pboxes, t): + """ + # Arguments + conf: softmax score (face or not) of each box + reg: regression values of x1, y1, x2, y2 coordinates. + The values are normalized to box width and height. + pboxes: input boxes to RNet + t: confidence threshold + + # Returns + boxes: a numpy array of box coordinates and cooresponding + scores: [[x1, y1, x2, y2, score], ...] + """ + boxes = pboxes.copy() # make a copy + assert boxes.shape[0] == conf.shape[0] + boxes[:, 4] = conf # update 'score' of all boxes + boxes = boxes[conf >= t, :] + reg = reg[conf >= t, :] + ww = (boxes[:, 2]-boxes[:, 0]+1).reshape(-1, 1) # x2 - x1 + 1 + hh = (boxes[:, 3]-boxes[:, 1]+1).reshape(-1, 1) # y2 - y1 + 1 + boxes[:, 0:4] += np.concatenate((ww, hh, ww, hh), axis=1) * reg + return boxes + + +def generate_onet_outputs(conf, reg_boxes, reg_marks, rboxes, t): + """ + # Arguments + conf: softmax score (face or not) of each box + reg_boxes: regression values of x1, y1, x2, y2 + The values are normalized to box width and height. + reg_marks: regression values of the 5 facial landmark points + rboxes: input boxes to ONet (already converted to 2x1) + t: confidence threshold + + # Returns + boxes: a numpy array of box coordinates and cooresponding + scores: [[x1, y1, x2, y2,... , score], ...] + landmarks: a numpy array of facial landmark coordinates: + [[x1, x2, ..., x5, y1, y2, ..., y5], ...] + """ + boxes = rboxes.copy() # make a copy + assert boxes.shape[0] == conf.shape[0] + boxes[:, 4] = conf + boxes = boxes[conf >= t, :] + reg_boxes = reg_boxes[conf >= t, :] + reg_marks = reg_marks[conf >= t, :] + xx = boxes[:, 0].reshape(-1, 1) + yy = boxes[:, 1].reshape(-1, 1) + ww = (boxes[:, 2]-boxes[:, 0]).reshape(-1, 1) + hh = (boxes[:, 3]-boxes[:, 1]).reshape(-1, 1) + marks = np.concatenate((xx, xx, xx, xx, xx, yy, yy, yy, yy, yy), axis=1) + marks += np.concatenate((ww, ww, ww, ww, ww, hh, hh, hh, hh, hh), axis=1) * reg_marks + ww = ww + 1 + hh = hh + 1 + boxes[:, 0:4] += np.concatenate((ww, hh, ww, hh), axis=1) * reg_boxes + return boxes, marks + + +def clip_dets(dets, img_w, img_h): + """Round and clip detection (x1, y1, ...) values. + + Note we exclude the last value of 'dets' in computation since + it is 'conf'. + """ + dets[:, 0:-1] = np.fix(dets[:, 0:-1]) + evens = np.arange(0, dets.shape[1]-1, 2) + odds = np.arange(1, dets.shape[1]-1, 2) + dets[:, evens] = np.clip(dets[:, evens], 0., float(img_w-1)) + dets[:, odds] = np.clip(dets[:, odds], 0., float(img_h-1)) + return dets + + +class TrtPNet(object): + """TrtPNet + + Refer to mtcnn/det1_relu.prototxt for calculation of input/output + dimmensions of TrtPNet, as well as input H offsets (for all scales). + The output H offsets are merely input offsets divided by stride (2). 
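# A small worked example of the nms() helper defined earlier in this module:
# two heavily overlapping boxes and one separate box, with made-up scores.
# The higher-scoring of the overlapping pair survives, plus the lone box.
# Note that importing utils.mtcnn also loads the pytrt extension, so this
# assumes the extension has been built with "make".
import numpy as np
from utils.mtcnn import nms

boxes = np.array([[10,  10,  50,  50,  0.9],
                  [12,  12,  52,  52,  0.8],    # overlaps the first box
                  [100, 100, 140, 140, 0.7]], dtype=np.float32)
keep = nms(boxes, threshold=0.5, type='Union')
print(keep)          # expected: [0, 2]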
+ """ + input_h_offsets = (0, 216, 370, 478, 556, 610, 648, 676, 696) + output_h_offsets = (0, 108, 185, 239, 278, 305, 324, 338, 348) + max_n_scales = 9 + + def __init__(self, engine): + """__init__ + + # Arguments + engine: path to the TensorRT engine file + """ + self.trtnet = pytrt.PyTrtMtcnn(engine, + (3, 710, 384), + (2, 350, 187), + (4, 350, 187)) + self.trtnet.set_batchsize(1) + + def detect(self, img, minsize=40, factor=0.709, threshold=0.7): + """Detect faces using PNet + + # Arguments + img: input image as a RGB numpy array + threshold: confidence threshold + + # Returns + A numpy array of bounding box coordinates and the + cooresponding scores: [[x1, y1, x2, y2, score], ...] + """ + if minsize < 40: + raise ValueError("TrtPNet is currently designed with " + "'minsize' >= 40") + if factor > 0.709: + raise ValueError("TrtPNet is currently designed with " + "'factor' <= 0.709") + m = 12.0 / minsize + img_h, img_w, _ = img.shape + minl = min(img_h, img_w) * m + + # create scale pyramid + scales = [] + while minl >= 12: + scales.append(m) + m *= factor + minl *= factor + if len(scales) > self.max_n_scales: # probably won't happen... + raise ValueError('Too many scales, try increasing minsize ' + 'or decreasing factor.') + + total_boxes = np.zeros((0, 5), dtype=np.float32) + img = (img.astype(np.float32) - PIXEL_MEAN) * PIXEL_SCALE + + # stack all scales of the input image vertically into 1 big + # image, and only do inferencing once + im_data = np.zeros((1, 3, 710, 384), dtype=np.float32) + for i, scale in enumerate(scales): + h_offset = self.input_h_offsets[i] + h = int(img_h * scale) + w = int(img_w * scale) + im_data[0, :, h_offset:(h_offset+h), :w] = \ + cv2.resize(img, (w, h)).transpose((2, 0, 1)) + + out = self.trtnet.forward(im_data) + + # extract outputs of each scale from the big output blob + for i, scale in enumerate(scales): + h_offset = self.output_h_offsets[i] + h = (int(img_h * scale) - 12) // 2 + 1 + w = (int(img_w * scale) - 12) // 2 + 1 + pp = out['prob1'][0, 1, h_offset:(h_offset+h), :w] + cc = out['boxes'][0, :, h_offset:(h_offset+h), :w] + boxes = generate_pnet_bboxes(pp, cc, scale, threshold) + if boxes.shape[0] > 0: + pick = nms(boxes, 0.5, 'Union') + if len(pick) > 0: + boxes = boxes[pick, :] + if boxes.shape[0] > 0: + total_boxes = np.concatenate((total_boxes, boxes), axis=0) + + if total_boxes.shape[0] == 0: + return total_boxes + pick = nms(total_boxes, 0.7, 'Union') + dets = clip_dets(total_boxes[pick, :], img_w, img_h) + return dets + + def destroy(self): + self.trtnet.destroy() + self.trtnet = None + + +class TrtRNet(object): + """TrtRNet + + # Arguments + engine: path to the TensorRT engine (det2) file + """ + + def __init__(self, engine): + self.trtnet = pytrt.PyTrtMtcnn(engine, + (3, 24, 24), + (2, 1, 1), + (4, 1, 1)) + + def detect(self, img, boxes, max_batch=256, threshold=0.6): + """Detect faces using RNet + + # Arguments + img: input image as a RGB numpy array + boxes: detection results by PNet, a numpy array [:, 0:5] + of [x1, y1, x2, y2, score]'s + max_batch: only process these many top boxes from PNet + threshold: confidence threshold + + # Returns + A numpy array of bounding box coordinates and the + cooresponding scores: [[x1, y1, x2, y2, score], ...] 
+ """ + if max_batch > 256: + raise ValueError('Bad max_batch: %d' % max_batch) + boxes = boxes[:max_batch] # assuming boxes are sorted by score + if boxes.shape[0] == 0: + return boxes + img_h, img_w, _ = img.shape + boxes = convert_to_1x1(boxes) + crops = np.zeros((boxes.shape[0], 24, 24, 3), dtype=np.uint8) + for i, det in enumerate(boxes): + cropped_im = crop_img_with_padding(img, det) + # NOTE: H and W dimensions need to be transposed for RNet! + crops[i, ...] = cv2.transpose(cv2.resize(cropped_im, (24, 24))) + crops = crops.transpose((0, 3, 1, 2)) # NHWC -> NCHW + crops = (crops.astype(np.float32) - PIXEL_MEAN) * PIXEL_SCALE + + self.trtnet.set_batchsize(crops.shape[0]) + out = self.trtnet.forward(crops) + + pp = out['prob1'][:, 1, 0, 0] + cc = out['boxes'][:, :, 0, 0] + boxes = generate_rnet_bboxes(pp, cc, boxes, threshold) + if boxes.shape[0] == 0: + return boxes + pick = nms(boxes, 0.7, 'Union') + dets = clip_dets(boxes[pick, :], img_w, img_h) + return dets + + def destroy(self): + self.trtnet.destroy() + self.trtnet = None + + +class TrtONet(object): + """TrtONet + + # Arguments + engine: path to the TensorRT engine (det3) file + """ + + def __init__(self, engine): + self.trtnet = pytrt.PyTrtMtcnn(engine, + (3, 48, 48), + (2, 1, 1), + (4, 1, 1), + (10, 1, 1)) + + def detect(self, img, boxes, max_batch=64, threshold=0.7): + """Detect faces using ONet + + # Arguments + img: input image as a RGB numpy array + boxes: detection results by RNet, a numpy array [:, 0:5] + of [x1, y1, x2, y2, score]'s + max_batch: only process these many top boxes from RNet + threshold: confidence threshold + + # Returns + dets: boxes and conf scores + landmarks + """ + if max_batch > 64: + raise ValueError('Bad max_batch: %d' % max_batch) + if boxes.shape[0] == 0: + return (np.zeros((0, 5), dtype=np.float32), + np.zeros((0, 10), dtype=np.float32)) + boxes = boxes[:max_batch] # assuming boxes are sorted by score + img_h, img_w, _ = img.shape + boxes = convert_to_1x1(boxes) + crops = np.zeros((boxes.shape[0], 48, 48, 3), dtype=np.uint8) + for i, det in enumerate(boxes): + cropped_im = crop_img_with_padding(img, det) + # NOTE: H and W dimensions need to be transposed for RNet! + crops[i, ...] = cv2.transpose(cv2.resize(cropped_im, (48, 48))) + crops = crops.transpose((0, 3, 1, 2)) # NHWC -> NCHW + crops = (crops.astype(np.float32) - PIXEL_MEAN) * PIXEL_SCALE + + self.trtnet.set_batchsize(crops.shape[0]) + out = self.trtnet.forward(crops) + + pp = out['prob1'][:, 1, 0, 0] + cc = out['boxes'][:, :, 0, 0] + mm = out['landmarks'][:, :, 0, 0] + boxes, landmarks = generate_onet_outputs(pp, cc, mm, boxes, threshold) + pick = nms(boxes, 0.7, 'Min') + return (clip_dets(boxes[pick, :], img_w, img_h), + np.fix(landmarks[pick, :])) + + def destroy(self): + self.trtnet.destroy() + self.trtnet = None + + +class TrtMtcnn(object): + """TrtMtcnn""" + + def __init__(self): + self.pnet = TrtPNet('mtcnn/det1.engine') + self.rnet = TrtRNet('mtcnn/det2.engine') + self.onet = TrtONet('mtcnn/det3.engine') + + def __del__(self): + self.onet.destroy() + self.rnet.destroy() + self.pnet.destroy() + + def _detect_1280x720(self, img, minsize): + """_detec_1280x720() + + Assuming 'img' has been resized to less than 1280x720. + """ + # MTCNN model was trained with 'MATLAB' image so its channel + # order is RGB instead of BGR. 
+ img = img[:, :, ::-1] # BGR -> RGB + dets = self.pnet.detect(img, minsize=minsize) + dets = self.rnet.detect(img, dets) + dets, landmarks = self.onet.detect(img, dets) + return dets, landmarks + + def detect(self, img, minsize=40): + """detect() + + This function handles rescaling of the input image if it's + larger than 1280x720. + """ + if img is None: + raise ValueError + img_h, img_w, _ = img.shape + scale = min(720. / img_h, 1280. / img_w) + if scale < 1.0: + new_h = int(np.ceil(img_h * scale)) + new_w = int(np.ceil(img_w * scale)) + img = cv2.resize(img, (new_w, new_h)) + minsize = max(int(np.ceil(minsize * scale)), 40) + dets, landmarks = self._detect_1280x720(img, minsize) + if scale < 1.0: + dets[:, :-1] = np.fix(dets[:, :-1] / scale) + landmarks = np.fix(landmarks / scale) + return dets, landmarks diff --git a/utils/ssd.py b/utils/ssd.py new file mode 100644 index 0000000..63d642e --- /dev/null +++ b/utils/ssd.py @@ -0,0 +1,125 @@ +"""ssd.py + +This module implements the TrtSSD class. +""" + + +import ctypes + +import numpy as np +import cv2 +import tensorrt as trt +import pycuda.driver as cuda + + +def _preprocess_trt(img, shape=(300, 300)): + """Preprocess an image before TRT SSD inferencing.""" + img = cv2.resize(img, shape) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = img.transpose((2, 0, 1)).astype(np.float32) + img *= (2.0/255.0) + img -= 1.0 + return img + + +def _postprocess_trt(img, output, conf_th, output_layout=7): + """Postprocess TRT SSD output.""" + img_h, img_w, _ = img.shape + boxes, confs, clss = [], [], [] + for prefix in range(0, len(output), output_layout): + #index = int(output[prefix+0]) + conf = float(output[prefix+2]) + if conf < conf_th: + continue + x1 = int(output[prefix+3] * img_w) + y1 = int(output[prefix+4] * img_h) + x2 = int(output[prefix+5] * img_w) + y2 = int(output[prefix+6] * img_h) + cls = int(output[prefix+1]) + boxes.append((x1, y1, x2, y2)) + confs.append(conf) + clss.append(cls) + return boxes, confs, clss + + +class TrtSSD(object): + """TrtSSD class encapsulates things needed to run TRT SSD.""" + + def _load_plugins(self): + if trt.__version__[0] < '7': + ctypes.CDLL("ssd/libflattenconcat.so") + trt.init_libnvinfer_plugins(self.trt_logger, '') + + def _load_engine(self): + TRTbin = 'ssd/TRT_%s.bin' % self.model + with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def _allocate_buffers(self): + host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \ + [], [], [], [], [] + for binding in self.engine: + size = trt.volume(self.engine.get_binding_shape(binding)) * \ + self.engine.max_batch_size + host_mem = cuda.pagelocked_empty(size, np.float32) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + bindings.append(int(cuda_mem)) + if self.engine.binding_is_input(binding): + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + else: + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings + + def __init__(self, model, input_shape, cuda_ctx=None): + """Initialize TensorRT plugins, engine and conetxt.""" + self.model = model + self.input_shape = input_shape + self.cuda_ctx = cuda_ctx + if self.cuda_ctx: + self.cuda_ctx.push() + + self.trt_logger = trt.Logger(trt.Logger.INFO) + self._load_plugins() + self.engine = self._load_engine() + + try: + self.context = self.engine.create_execution_context() + self.stream = cuda.Stream() + self.host_inputs, self.host_outputs, 
self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers() + except Exception as e: + raise RuntimeError('fail to allocate CUDA resources') from e + finally: + if self.cuda_ctx: + self.cuda_ctx.pop() + + def __del__(self): + """Free CUDA memories and context.""" + del self.cuda_outputs + del self.cuda_inputs + del self.stream + + def detect(self, img, conf_th=0.3): + """Detect objects in the input image.""" + img_resized = _preprocess_trt(img, self.input_shape) + np.copyto(self.host_inputs[0], img_resized.ravel()) + + if self.cuda_ctx: + self.cuda_ctx.push() + cuda.memcpy_htod_async( + self.cuda_inputs[0], self.host_inputs[0], self.stream) + self.context.execute_async( + batch_size=1, + bindings=self.bindings, + stream_handle=self.stream.handle) + cuda.memcpy_dtoh_async( + self.host_outputs[1], self.cuda_outputs[1], self.stream) + cuda.memcpy_dtoh_async( + self.host_outputs[0], self.cuda_outputs[0], self.stream) + self.stream.synchronize() + if self.cuda_ctx: + self.cuda_ctx.pop() + + output = self.host_outputs[0] + return _postprocess_trt(img, output, conf_th) diff --git a/utils/ssd_classes.py b/utils/ssd_classes.py new file mode 100644 index 0000000..47ee672 --- /dev/null +++ b/utils/ssd_classes.py @@ -0,0 +1,115 @@ +"""ssd_classes.py + +This file was modified from: +http://github.com/AastaNV/TRT_object_detection/blob/master/coco.py +""" + +COCO_CLASSES_LIST = [ + 'background', # was 'unlabeled' + 'person', + 'bicycle', + 'car', + 'motorcycle', + 'airplane', + 'bus', + 'train', + 'truck', + 'boat', + 'traffic light', + 'fire hydrant', + 'street sign', + 'stop sign', + 'parking meter', + 'bench', + 'bird', + 'cat', + 'dog', + 'horse', + 'sheep', + 'cow', + 'elephant', + 'bear', + 'zebra', + 'giraffe', + 'hat', + 'backpack', + 'umbrella', + 'shoe', + 'eye glasses', + 'handbag', + 'tie', + 'suitcase', + 'frisbee', + 'skis', + 'snowboard', + 'sports ball', + 'kite', + 'baseball bat', + 'baseball glove', + 'skateboard', + 'surfboard', + 'tennis racket', + 'bottle', + 'plate', + 'wine glass', + 'cup', + 'fork', + 'knife', + 'spoon', + 'bowl', + 'banana', + 'apple', + 'sandwich', + 'orange', + 'broccoli', + 'carrot', + 'hot dog', + 'pizza', + 'donut', + 'cake', + 'chair', + 'couch', + 'potted plant', + 'bed', + 'mirror', + 'dining table', + 'window', + 'desk', + 'toilet', + 'door', + 'tv', + 'laptop', + 'mouse', + 'remote', + 'keyboard', + 'cell phone', + 'microwave', + 'oven', + 'toaster', + 'sink', + 'refrigerator', + 'blender', + 'book', + 'clock', + 'vase', + 'scissors', + 'teddy bear', + 'hair drier', + 'toothbrush', +] + +EGOHANDS_CLASSES_LIST = [ + 'background', + 'hand', +] + + +def get_cls_dict(model): + """Get the class ID to name translation dictionary.""" + if model == 'coco': + cls_list = COCO_CLASSES_LIST + elif model == 'egohands': + cls_list = EGOHANDS_CLASSES_LIST + else: + raise ValueError('Bad model name') + return {i: n for i, n in enumerate(cls_list)} diff --git a/utils/ssd_tf.py b/utils/ssd_tf.py new file mode 100644 index 0000000..e0c953b --- /dev/null +++ b/utils/ssd_tf.py @@ -0,0 +1,59 @@ +"""ssd_tf.py + +This module implements the TfSSD class. 
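# A minimal sketch of using TrtSSD with the COCO class dictionary from
# utils/ssd_classes.py on a single image. It assumes the serialized engine
# ssd/TRT_ssd_mobilenet_v1_coco.bin has already been built; 'dog.jpg' is a
# placeholder image path.
import cv2
import pycuda.autoinit  # noqa: F401

from utils.ssd import TrtSSD
from utils.ssd_classes import get_cls_dict
from utils.visualization import BBoxVisualization

trt_ssd = TrtSSD('ssd_mobilenet_v1_coco', (300, 300))
vis = BBoxVisualization(get_cls_dict('coco'))
img = cv2.imread('dog.jpg')
boxes, confs, clss = trt_ssd.detect(img, conf_th=0.3)
cv2.imwrite('dog_ssd.jpg', vis.draw_bboxes(img, boxes, confs, clss))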
+""" + + +import numpy as np +import cv2 +import tensorflow as tf + + +def _postprocess_tf(img, boxes, scores, classes, conf_th): + """Postprocess TensorFlow SSD output.""" + h, w, _ = img.shape + out_boxes = boxes[0] * np.array([h, w, h, w]) + out_boxes = out_boxes.astype(np.int32) + out_boxes = out_boxes[:, [1, 0, 3, 2]] # swap x's and y's + out_confs = scores[0] + out_clss = classes[0].astype(np.int32) + + # only return bboxes with confidence score above threshold + mask = np.where(out_confs >= conf_th) + return out_boxes[mask], out_confs[mask], out_clss[mask] + + +class TfSSD(object): + """TfSSD class encapsulates things needed to run TensorFlow SSD.""" + + def __init__(self, model, input_shape): + self.model = model + self.input_shape = input_shape + + # load detection graph + ssd_graph = tf.Graph() + with ssd_graph.as_default(): + graph_def = tf.GraphDef() + with tf.gfile.GFile('ssd/%s.pb' % model, 'rb') as fid: + serialized_graph = fid.read() + graph_def.ParseFromString(serialized_graph) + tf.import_graph_def(graph_def, name='') + + # define input/output tensors + self.image_tensor = ssd_graph.get_tensor_by_name('image_tensor:0') + self.det_boxes = ssd_graph.get_tensor_by_name('detection_boxes:0') + self.det_scores = ssd_graph.get_tensor_by_name('detection_scores:0') + self.det_classes = ssd_graph.get_tensor_by_name('detection_classes:0') + + # create the session for inferencing + self.sess = tf.Session(graph=ssd_graph) + + def __del__(self): + self.sess.close() + + def detect(self, img, conf_th): + img_resized = _preprocess_tf(img, self.input_shape) + boxes, scores, classes = self.sess.run( + [self.det_boxes, self.det_scores, self.det_classes], + feed_dict={self.image_tensor: np.expand_dims(img_resized, 0)}) + return _postprocess_tf(img, boxes, scores, classes, conf_th) diff --git a/utils/visualization.py b/utils/visualization.py new file mode 100644 index 0000000..776d90f --- /dev/null +++ b/utils/visualization.py @@ -0,0 +1,102 @@ +"""visualization.py + +The BBoxVisualization class implements drawing of nice looking +bounding boxes based on object detection results. +""" + + +import numpy as np +import cv2 + + +# Constants +ALPHA = 0.5 +FONT = cv2.FONT_HERSHEY_PLAIN +TEXT_SCALE = 1.0 +TEXT_THICKNESS = 1 +BLACK = (0, 0, 0) +WHITE = (255, 255, 255) + + +def gen_colors(num_colors): + """Generate different colors. + + # Arguments + num_colors: total number of colors/classes. + + # Output + bgrs: a list of (B, G, R) tuples which correspond to each of + the colors/classes. + """ + import random + import colorsys + + hsvs = [[float(x) / num_colors, 1., 0.7] for x in range(num_colors)] + random.seed(1234) + random.shuffle(hsvs) + rgbs = list(map(lambda x: list(colorsys.hsv_to_rgb(*x)), hsvs)) + bgrs = [(int(rgb[2] * 255), int(rgb[1] * 255), int(rgb[0] * 255)) + for rgb in rgbs] + return bgrs + + +def draw_boxed_text(img, text, topleft, color): + """Draw a transluent boxed text in white, overlayed on top of a + colored patch surrounded by a black border. FONT, TEXT_SCALE, + TEXT_THICKNESS and ALPHA values are constants (fixed) as defined + on top. + + # Arguments + img: the input image as a numpy array. + text: the text to be drawn. + topleft: XY coordinate of the topleft corner of the boxed text. + color: color of the patch, i.e. background of the text. + + # Output + img: note the original image is modified inplace. 
+ """ + assert img.dtype == np.uint8 + img_h, img_w, _ = img.shape + if topleft[0] >= img_w or topleft[1] >= img_h: + return img + margin = 3 + size = cv2.getTextSize(text, FONT, TEXT_SCALE, TEXT_THICKNESS) + w = size[0][0] + margin * 2 + h = size[0][1] + margin * 2 + # the patch is used to draw boxed text + patch = np.zeros((h, w, 3), dtype=np.uint8) + patch[...] = color + cv2.putText(patch, text, (margin+1, h-margin-2), FONT, TEXT_SCALE, + WHITE, thickness=TEXT_THICKNESS, lineType=cv2.LINE_8) + cv2.rectangle(patch, (0, 0), (w-1, h-1), BLACK, thickness=1) + w = min(w, img_w - topleft[0]) # clip overlay at image boundary + h = min(h, img_h - topleft[1]) + # Overlay the boxed text onto region of interest (roi) in img + roi = img[topleft[1]:topleft[1]+h, topleft[0]:topleft[0]+w, :] + cv2.addWeighted(patch[0:h, 0:w, :], ALPHA, roi, 1 - ALPHA, 0, roi) + return img + + +class BBoxVisualization(): + """BBoxVisualization class implements nice drawing of boudning boxes. + + # Arguments + cls_dict: a dictionary used to translate class id to its name. + """ + + def __init__(self, cls_dict): + self.cls_dict = cls_dict + self.colors = gen_colors(len(cls_dict)) + + def draw_bboxes(self, img, boxes, confs, clss): + """Draw detected bounding boxes on the original image.""" + for bb, cf, cl in zip(boxes, confs, clss): + cl = int(cl) + x_min, y_min, x_max, y_max = bb[0], bb[1], bb[2], bb[3] + color = self.colors[cl] + cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 2) + txt_loc = (max(x_min+2, 0), max(y_min+2, 0)) + cls_name = self.cls_dict.get(cl, 'CLS{}'.format(cl)) + txt = '{} {:.2f}'.format(cls_name, cf) + img = draw_boxed_text(img, txt, txt_loc, color) + return img diff --git a/utils/writer.py b/utils/writer.py new file mode 100644 index 0000000..8214204 --- /dev/null +++ b/utils/writer.py @@ -0,0 +1,30 @@ +"""writer.py +""" + + +import subprocess + +import cv2 + + +def get_video_writer(name, width, height, fps=30): + """Get a VideoWriter object for saving output video. + + This function tries to use Jetson's hardware H.264 encoder (omxh264enc) + if available, in which case the output video would be a MPEG-2 TS file. + Otherwise, it uses cv2's built-in encoding mechanism and saves a MP4 + file. + """ + gst_elements = str(subprocess.check_output('gst-inspect-1.0')) + if 'omxh264dec' in gst_elements: + filename = name + '.ts' # Transport Stream + gst_str = ('appsrc ! videoconvert ! omxh264enc ! mpegtsmux ! ' + 'filesink location=%s') % filename + return cv2.VideoWriter( + gst_str, cv2.CAP_GSTREAMER, 0, fps, (width, height)) + else: + filename = name + '.mp4' # MP4 + return cv2.VideoWriter( + filename, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)) + + diff --git a/utils/yolo_classes.py b/utils/yolo_classes.py new file mode 100644 index 0000000..e2cd241 --- /dev/null +++ b/utils/yolo_classes.py @@ -0,0 +1,104 @@ +"""yolo_classes.py + +NOTE: Number of YOLO COCO output classes differs from SSD COCO models. 
+""" + +COCO_CLASSES_LIST = [ + 'person', + 'bicycle', + 'car', + 'motorbike', + 'aeroplane', + 'bus', + 'train', + 'truck', + 'boat', + 'traffic light', + 'fire hydrant', + 'stop sign', + 'parking meter', + 'bench', + 'bird', + 'cat', + 'dog', + 'horse', + 'sheep', + 'cow', + 'elephant', + 'bear', + 'zebra', + 'giraffe', + 'backpack', + 'umbrella', + 'handbag', + 'tie', + 'suitcase', + 'frisbee', + 'skis', + 'snowboard', + 'sports ball', + 'kite', + 'baseball bat', + 'baseball glove', + 'skateboard', + 'surfboard', + 'tennis racket', + 'bottle', + 'wine glass', + 'cup', + 'fork', + 'knife', + 'spoon', + 'bowl', + 'banana', + 'apple', + 'sandwich', + 'orange', + 'broccoli', + 'carrot', + 'hot dog', + 'pizza', + 'donut', + 'cake', + 'chair', + 'sofa', + 'pottedplant', + 'bed', + 'diningtable', + 'toilet', + 'tvmonitor', + 'laptop', + 'mouse', + 'remote', + 'keyboard', + 'cell phone', + 'microwave', + 'oven', + 'toaster', + 'sink', + 'refrigerator', + 'book', + 'clock', + 'vase', + 'scissors', + 'teddy bear', + 'hair drier', + 'toothbrush', +] + +# For translating YOLO class ids (0~79) to SSD class ids (0~90) +yolo_cls_to_ssd = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, +] + + +def get_cls_dict(category_num): + """Get the class ID to name translation dictionary.""" + if category_num == 80: + return {i: n for i, n in enumerate(COCO_CLASSES_LIST)} + else: + return {i: 'CLS%d' % i for i in range(category_num)} diff --git a/utils/yolo_with_plugins.py b/utils/yolo_with_plugins.py new file mode 100644 index 0000000..42704be --- /dev/null +++ b/utils/yolo_with_plugins.py @@ -0,0 +1,338 @@ +"""yolo_with_plugins.py + +Implementation of TrtYOLO class with the yolo_layer plugins. +""" + + +from __future__ import print_function + +import ctypes + +import numpy as np +import cv2 +import tensorrt as trt +import pycuda.driver as cuda + + +try: + ctypes.cdll.LoadLibrary('./plugins/libyolo_layer.so') +except OSError as e: + raise SystemExit('ERROR: failed to load ./plugins/libyolo_layer.so. ' + 'Did you forget to do a "make" in the "./plugins/" ' + 'subdirectory?') from e + + +def _preprocess_yolo(img, input_shape, letter_box=False): + """Preprocess an image before TRT YOLO inferencing. 
+ + # Args + img: int8 numpy array of shape (img_h, img_w, 3) + input_shape: a tuple of (H, W) + letter_box: boolean, specifies whether to keep aspect ratio and + create a "letterboxed" image for inference + + # Returns + preprocessed img: float32 numpy array of shape (3, H, W) + """ + if letter_box: + img_h, img_w, _ = img.shape + new_h, new_w = input_shape[0], input_shape[1] + offset_h, offset_w = 0, 0 + if (new_w / img_w) <= (new_h / img_h): + new_h = int(img_h * new_w / img_w) + offset_h = (input_shape[0] - new_h) // 2 + else: + new_w = int(img_w * new_h / img_h) + offset_w = (input_shape[1] - new_w) // 2 + resized = cv2.resize(img, (new_w, new_h)) + img = np.full((input_shape[0], input_shape[1], 3), 127, dtype=np.uint8) + img[offset_h:(offset_h + new_h), offset_w:(offset_w + new_w), :] = resized + else: + img = cv2.resize(img, (input_shape[1], input_shape[0])) + + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = img.transpose((2, 0, 1)).astype(np.float32) + img /= 255.0 + return img + + +def _nms_boxes(detections, nms_threshold): + """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding + boxes with their confidence scores and return an array with the + indexes of the bounding boxes we want to keep. + + # Args + detections: Nx7 numpy arrays of + [[x, y, w, h, box_confidence, class_id, class_prob], + ......] + """ + x_coord = detections[:, 0] + y_coord = detections[:, 1] + width = detections[:, 2] + height = detections[:, 3] + box_confidences = detections[:, 4] * detections[:, 6] + + areas = width * height + ordered = box_confidences.argsort()[::-1] + + keep = list() + while ordered.size > 0: + # Index of the current element: + i = ordered[0] + keep.append(i) + xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]]) + yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]]) + xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]]) + yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]]) + + width1 = np.maximum(0.0, xx2 - xx1 + 1) + height1 = np.maximum(0.0, yy2 - yy1 + 1) + intersection = width1 * height1 + union = (areas[i] + areas[ordered[1:]] - intersection) + iou = intersection / union + indexes = np.where(iou <= nms_threshold)[0] + ordered = ordered[indexes + 1] + + keep = np.array(keep) + return keep + + +def _postprocess_yolo(trt_outputs, img_w, img_h, conf_th, nms_threshold, + input_shape, letter_box=False): + """Postprocess TensorRT outputs. 
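# A worked example of the letterbox geometry computed in _preprocess_yolo()
# above, for a 1280x720 frame going into a 416x416 network: the frame is
# scaled to 416x234 and padded with gray bands of 91 rows top and bottom.
import numpy as np
import cv2

img = np.zeros((720, 1280, 3), dtype=np.uint8)       # synthetic frame
input_shape = (416, 416)                              # (H, W)
img_h, img_w = img.shape[:2]
new_h, new_w = input_shape
offset_h, offset_w = 0, 0
if (new_w / img_w) <= (new_h / img_h):                # width is the tight fit
    new_h = int(img_h * new_w / img_w)                # 234
    offset_h = (input_shape[0] - new_h) // 2          # 91
else:
    new_w = int(img_w * new_h / img_h)
    offset_w = (input_shape[1] - new_w) // 2
canvas = np.full((input_shape[0], input_shape[1], 3), 127, dtype=np.uint8)
canvas[offset_h:offset_h + new_h, offset_w:offset_w + new_w, :] = \
    cv2.resize(img, (new_w, new_h))
print(new_w, new_h, offset_w, offset_h)               # 416 234 0 91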
+
+    # Args
+        trt_outputs: a list of 2 or 3 tensors, where each tensor
+                     contains a multiple of 7 float32 numbers in
+                     the order of [x, y, w, h, box_confidence, class_id, class_prob]
+        conf_th: confidence threshold
+        letter_box: boolean, referring to _preprocess_yolo()
+
+    # Returns
+        boxes, scores, classes (after NMS)
+    """
+    # filter low-conf detections and concatenate results of all yolo layers
+    detections = []
+    for o in trt_outputs:
+        dets = o.reshape((-1, 7))
+        dets = dets[dets[:, 4] * dets[:, 6] >= conf_th]
+        detections.append(dets)
+    detections = np.concatenate(detections, axis=0)
+
+    if len(detections) == 0:
+        boxes = np.zeros((0, 4), dtype=int)
+        scores = np.zeros((0,), dtype=np.float32)
+        classes = np.zeros((0,), dtype=np.float32)
+    else:
+        box_scores = detections[:, 4] * detections[:, 6]
+
+        # scale x, y, w, h from [0, 1] to pixel values
+        old_h, old_w = img_h, img_w
+        offset_h, offset_w = 0, 0
+        if letter_box:
+            if (img_w / input_shape[1]) >= (img_h / input_shape[0]):
+                old_h = int(input_shape[0] * img_w / input_shape[1])
+                offset_h = (old_h - img_h) // 2
+            else:
+                old_w = int(input_shape[1] * img_h / input_shape[0])
+                offset_w = (old_w - img_w) // 2
+        detections[:, 0:4] *= np.array(
+            [old_w, old_h, old_w, old_h], dtype=np.float32)
+
+        # NMS
+        nms_detections = np.zeros((0, 7), dtype=detections.dtype)
+        for class_id in set(detections[:, 5]):
+            idxs = np.where(detections[:, 5] == class_id)
+            cls_detections = detections[idxs]
+            keep = _nms_boxes(cls_detections, nms_threshold)
+            nms_detections = np.concatenate(
+                [nms_detections, cls_detections[keep]], axis=0)
+
+        xx = nms_detections[:, 0].reshape(-1, 1)
+        yy = nms_detections[:, 1].reshape(-1, 1)
+        if letter_box:
+            xx = xx - offset_w
+            yy = yy - offset_h
+        ww = nms_detections[:, 2].reshape(-1, 1)
+        hh = nms_detections[:, 3].reshape(-1, 1)
+        boxes = np.concatenate([xx, yy, xx+ww, yy+hh], axis=1) + 0.5
+        boxes = boxes.astype(int)
+        scores = nms_detections[:, 4] * nms_detections[:, 6]
+        classes = nms_detections[:, 5]
+    return boxes, scores, classes
+
+
+class HostDeviceMem(object):
+    """Simple helper data class that's a little nicer to use than a 2-tuple."""
+    def __init__(self, host_mem, device_mem):
+        self.host = host_mem
+        self.device = device_mem
+
+    def __str__(self):
+        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __del__(self):
+        del self.device
+        del self.host
+
+
+def get_input_shape(engine):
+    """Get input shape of the TensorRT YOLO engine."""
+    binding = engine[0]
+    assert engine.binding_is_input(binding)
+    binding_dims = engine.get_binding_shape(binding)
+    if len(binding_dims) == 4:
+        return tuple(binding_dims[2:])
+    elif len(binding_dims) == 3:
+        return tuple(binding_dims[1:])
+    else:
+        raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims)))
+
+
+def allocate_buffers(engine):
+    """Allocates all host/device in/out buffers required for an engine."""
+    inputs = []
+    outputs = []
+    bindings = []
+    output_idx = 0
+    stream = cuda.Stream()
+    for binding in engine:
+        binding_dims = engine.get_binding_shape(binding)
+        if len(binding_dims) == 4:
+            # explicit batch case (TensorRT 7+)
+            size = trt.volume(binding_dims)
+        elif len(binding_dims) == 3:
+            # implicit batch case (TensorRT 6 or older)
+            size = trt.volume(binding_dims) * engine.max_batch_size
+        else:
+            raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims)))
+        dtype = trt.nptype(engine.get_binding_dtype(binding))
+        # Allocate host and device
buffers + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(device_mem)) + # Append to the appropriate list. + if engine.binding_is_input(binding): + inputs.append(HostDeviceMem(host_mem, device_mem)) + else: + # each grid has 3 anchors, each anchor generates a detection + # output of 7 float32 values + assert size % 7 == 0 + outputs.append(HostDeviceMem(host_mem, device_mem)) + output_idx += 1 + assert len(inputs) == 1 + assert len(outputs) == 1 + return inputs, outputs, bindings, stream + + +def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): + """do_inference (for TensorRT 6.x or lower) + + This function is generalized for multiple inputs/outputs. + Inputs and outputs are expected to be lists of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async(batch_size=batch_size, + bindings=bindings, + stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. + return [out.host for out in outputs] + + +def do_inference_v2(context, bindings, inputs, outputs, stream): + """do_inference_v2 (for TensorRT 7.0+) + + This function is generalized for multiple inputs/outputs for full + dimension networks. + Inputs and outputs are expected to be lists of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. 
+    return [out.host for out in outputs]
+
+
+class TrtYOLO(object):
+    """TrtYOLO class encapsulates things needed to run TRT YOLO."""
+
+    def _load_engine(self):
+        TRTbin = 'yolo/%s.trt' % self.model
+        with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
+
+    def __init__(self, model, category_num=80, letter_box=False, cuda_ctx=None):
+        """Initialize TensorRT plugins, engine and context."""
+        self.model = model
+        self.category_num = category_num
+        self.letter_box = letter_box
+        self.cuda_ctx = cuda_ctx
+        if self.cuda_ctx:
+            self.cuda_ctx.push()
+
+        self.inference_fn = do_inference if trt.__version__[0] < '7' \
+            else do_inference_v2
+        self.trt_logger = trt.Logger(trt.Logger.INFO)
+        self.engine = self._load_engine()
+
+        self.input_shape = get_input_shape(self.engine)
+
+        try:
+            self.context = self.engine.create_execution_context()
+            self.inputs, self.outputs, self.bindings, self.stream = \
+                allocate_buffers(self.engine)
+        except Exception as e:
+            raise RuntimeError('failed to allocate CUDA resources') from e
+        finally:
+            if self.cuda_ctx:
+                self.cuda_ctx.pop()
+
+    def __del__(self):
+        """Free CUDA memory."""
+        del self.outputs
+        del self.inputs
+        del self.stream
+
+    def detect(self, img, conf_th=0.3, letter_box=None):
+        """Detect objects in the input image."""
+        letter_box = self.letter_box if letter_box is None else letter_box
+        img_resized = _preprocess_yolo(img, self.input_shape, letter_box)
+
+        # Set host input to the image. The do_inference() function
+        # will copy the input to the GPU before executing.
+        self.inputs[0].host = np.ascontiguousarray(img_resized)
+        if self.cuda_ctx:
+            self.cuda_ctx.push()
+        trt_outputs = self.inference_fn(
+            context=self.context,
+            bindings=self.bindings,
+            inputs=self.inputs,
+            outputs=self.outputs,
+            stream=self.stream)
+        if self.cuda_ctx:
+            self.cuda_ctx.pop()
+
+        boxes, scores, classes = _postprocess_yolo(
+            trt_outputs, img.shape[1], img.shape[0], conf_th,
+            nms_threshold=0.5, input_shape=self.input_shape,
+            letter_box=letter_box)
+
+        # clip x1, y1, x2, y2 within original image
+        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, img.shape[1]-1)
+        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, img.shape[0]-1)
+        return boxes, scores, classes
diff --git a/yolo/build_dla_engines.sh b/yolo/build_dla_engines.sh new file mode 100755 index 0000000..b21ad2e --- /dev/null +++ b/yolo/build_dla_engines.sh @@ -0,0 +1,43 @@
+#!/bin/bash
+
+# I use this script to build DLA0 and DLA1 TensorRT engines for various
+# yolov3 and yolov4 models.
+
+set -e
+
+models="yolov3-tiny-416 yolov3-608 yolov3-spp-608 yolov4-tiny-416 yolov4-608"
+
+# make sure all needed files are present
+for m in ${models}; do
+    if [[ ! -f ${m}.cfg ]]; then
+        echo "ERROR: cannot find the file ${m}.cfg"
+        exit 1
+    fi
+    if [[ !
-f ${m}.onnx ]]; then + echo "ERROR: cannot find the file ${m}.onnx" + exit 1 + fi +done + +# create symbolic links to cfg and onnx files +for m in ${models}; do + m_head=${m%-*} + m_tail=${m##*-} + ln -sf ${m}.cfg ${m_head}-dla0-${m_tail}.cfg + ln -sf ${m}.onnx ${m_head}-dla0-${m_tail}.onnx + ln -sf ${m}.cfg ${m_head}-dla1-${m_tail}.cfg + ln -sf ${m}.onnx ${m_head}-dla1-${m_tail}.onnx +done + +# build TensorRT engines +for m in ${models}; do + m_head=${m%-*} + m_tail=${m##*-} + echo ; echo === ${m_head}-dla0-${m_tail} === ; echo + python3 onnx_to_tensorrt.py --int8 --dla_core 0 -m ${m_head}-dla0-${m_tail} + echo ; echo === ${m_head}-dla1-${m_tail} === ; echo + python3 onnx_to_tensorrt.py --int8 --dla_core 1 -m ${m_head}-dla1-${m_tail} +done + +echo +echo "Done." diff --git a/yolo/build_int8_engines.sh b/yolo/build_int8_engines.sh new file mode 100755 index 0000000..956d8c4 --- /dev/null +++ b/yolo/build_int8_engines.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# I use this script to build INT8 TensorRT engines for various yolov3 and +# yolov4 models. + +set -e + +models="yolov3-tiny-416 yolov3-608 yolov3-spp-608 yolov4-tiny-416 yolov4-608" + +# make sure all needed files are present +for m in ${models}; do + if [[ ! -f ${m}.cfg ]]; then + echo "ERROR: cannot find the file ${m}.cfg" + exit 1 + fi + if [[ ! -f ${m}.onnx ]]; then + echo "ERROR: cannot find the file ${m}.onnx" + exit 1 + fi +done + +# create symbolic links to cfg and onnx files +for m in ${models}; do + m_head=${m%-*} + m_tail=${m##*-} + ln -sf ${m}.cfg ${m_head}-int8-${m_tail}.cfg + ln -sf ${m}.onnx ${m_head}-int8-${m_tail}.onnx +done + +# build TensorRT engines +for m in ${models}; do + m_head=${m%-*} + m_tail=${m##*-} + echo ; echo === ${m_head}-int8-${m_tail} === ; echo + python3 onnx_to_tensorrt.py --int8 -m ${m_head}-int8-${m_tail} +done + +echo +echo "Done." 
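A minimal usage sketch of the TrtYOLO class from utils/yolo_with_plugins.py above. It assumes a "yolov4-416" engine has already been built as yolo/yolov4-416.trt, that the script runs from the repository root (so ./plugins/libyolo_layer.so and the engine path resolve), and that get_cls_dict() lives in utils/yolo_classes.py; the model name and test image are placeholders, not values mandated by the code itself.

import cv2
import pycuda.autoinit  # creates and activates a CUDA context for pycuda

from utils.yolo_classes import get_cls_dict       # module path assumed; get_cls_dict() is shown above
from utils.yolo_with_plugins import TrtYOLO       # loads ./plugins/libyolo_layer.so on import

cls_dict = get_cls_dict(80)                        # COCO: class id -> class name
trt_yolo = TrtYOLO('yolov4-416', category_num=80)  # deserializes yolo/yolov4-416.trt

img = cv2.imread('test.jpg')                       # placeholder input image
assert img is not None, 'failed to read the test image'

boxes, scores, classes = trt_yolo.detect(img, conf_th=0.3)
for (x_min, y_min, x_max, y_max), score, cls in zip(boxes, scores, classes):
    name = cls_dict.get(int(cls), 'CLS%d' % int(cls))
    print('%s %.2f (%d, %d, %d, %d)' % (name, score, x_min, y_min, x_max, y_max))

This is only a sanity check for a freshly built engine; the demo scripts in this repository add camera/video input and on-screen visualization on top of the same detect() call.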
diff --git a/yolo/calib_cache/calib_yolov3-int8-608.bin b/yolo/calib_cache/calib_yolov3-int8-608.bin new file mode 100644 index 0000000..addf27e --- /dev/null +++ b/yolo/calib_cache/calib_yolov3-int8-608.bin @@ -0,0 +1,251 @@ +TRT-7103-EntropyCalibration2 +000_net: 3c010a14 +001_convolutional: 3caf6955 +001_convolutional_bn: 3e068c22 +001_convolutional_lrelu: 3d9f315c +002_convolutional: 3e9e5cba +002_convolutional_bn: 3dac3fc1 +002_convolutional_lrelu: 3d23ae07 +003_convolutional: 3e07e38b +003_convolutional_bn: 3df2b08f +003_convolutional_lrelu: 3d625e33 +004_convolutional: 3db20ea3 +004_convolutional_bn: 3dd5690a +004_convolutional_lrelu: 3d908773 +005_shortcut: 3db8db93 +006_convolutional: 3e96d31a +006_convolutional_bn: 3dd71b8e +006_convolutional_lrelu: 3d6b0087 +007_convolutional: 3d80ca3f +007_convolutional_bn: 3d9a59ab +007_convolutional_lrelu: 3d0be6c5 +008_convolutional: 3dd0c902 +008_convolutional_bn: 3d41ad06 +008_convolutional_lrelu: 3d09817f +009_shortcut: 3d6a5051 +010_convolutional: 3df61395 +010_convolutional_bn: 3dda058a +010_convolutional_lrelu: 3d2f1d07 +011_convolutional: 3d60e65a +011_convolutional_bn: 3db28825 +011_convolutional_lrelu: 3d55a1c7 +012_shortcut: 3d92eb36 +013_convolutional: 3e76215d +013_convolutional_bn: 3dadb84b +013_convolutional_lrelu: 3d19feb3 +014_convolutional: 3d2e642b +014_convolutional_bn: 3d903514 +014_convolutional_lrelu: 3d0c08a6 +015_convolutional: 3ceab745 +015_convolutional_bn: 3d3364e6 +015_convolutional_lrelu: 3c9ec4fa +016_shortcut: 3d2244f8 +017_convolutional: 3d7674cc +017_convolutional_bn: 3d9297cd +017_convolutional_lrelu: 3d158097 +018_convolutional: 3d381760 +018_convolutional_bn: 3d3836c7 +018_convolutional_lrelu: 3cb3ed07 +019_shortcut: 3d27aee4 +020_convolutional: 3d5d677c +020_convolutional_bn: 3d88b4f1 +020_convolutional_lrelu: 3d01ae43 +021_convolutional: 3d1eb2b4 +021_convolutional_bn: 3d5ff557 +021_convolutional_lrelu: 3cad4ba3 +022_shortcut: 3d438d1a +023_convolutional: 3d48a468 +023_convolutional_bn: 3d786211 +023_convolutional_lrelu: 3d17a3aa +024_convolutional: 3d19821e +024_convolutional_bn: 3d500fe5 +024_convolutional_lrelu: 3c95a26c +025_shortcut: 3d5db913 +026_convolutional: 3d734ce0 +026_convolutional_bn: 3d9288af +026_convolutional_lrelu: 3cfaa739 +027_convolutional: 3d050035 +027_convolutional_bn: 3d5e24d9 +027_convolutional_lrelu: 3cf1386d +028_shortcut: 3d87ba8a +029_convolutional: 3d91eb8f +029_convolutional_bn: 3d88c4c3 +029_convolutional_lrelu: 3cf97d18 +030_convolutional: 3cbfe7a9 +030_convolutional_bn: 3d753009 +030_convolutional_lrelu: 3ce76734 +031_shortcut: 3da2b67a +032_convolutional: 3d8ae662 +032_convolutional_bn: 3d6dc036 +032_convolutional_lrelu: 3cf030df +033_convolutional: 3cc7b805 +033_convolutional_bn: 3d9e9c78 +033_convolutional_lrelu: 3d0141eb +034_shortcut: 3dadb1bd +035_convolutional: 3dc80287 +035_convolutional_bn: 3d83ea9e +035_convolutional_lrelu: 3d16f697 +036_convolutional: 3cca9a74 +036_convolutional_bn: 3da5ba97 +036_convolutional_lrelu: 3d13634a +037_shortcut: 3d9f6d7c +038_convolutional: 3e48a0d1 +038_convolutional_bn: 3da31bad +038_convolutional_lrelu: 3cf4e5a9 +039_convolutional: 3cb6eb19 +039_convolutional_bn: 3d7bc781 +039_convolutional_lrelu: 3d167ab9 +040_convolutional: 3d37a246 +040_convolutional_bn: 3d16fcfe +040_convolutional_lrelu: 3c188e32 +041_shortcut: 3d094bd6 +042_convolutional: 3cde602e +042_convolutional_bn: 3d74dd3e +042_convolutional_lrelu: 3d2fe82e +043_convolutional: 3d23234a +043_convolutional_bn: 3d2168ad +043_convolutional_lrelu: 3c9973ed +044_shortcut: 
3d0d99ee +045_convolutional: 3d187446 +045_convolutional_bn: 3d92f11d +045_convolutional_lrelu: 3cec68f7 +046_convolutional: 3ccca87d +046_convolutional_bn: 3d1ac05f +046_convolutional_lrelu: 3ca53f46 +047_shortcut: 3d2deb7e +048_convolutional: 3d123aea +048_convolutional_bn: 3d7b73ce +048_convolutional_lrelu: 3cdd621a +049_convolutional: 3cb7eec5 +049_convolutional_bn: 3d285180 +049_convolutional_lrelu: 3c9f1060 +050_shortcut: 3d4183f2 +051_convolutional: 3d169fa6 +051_convolutional_bn: 3d6c5487 +051_convolutional_lrelu: 3cdc27f5 +052_convolutional: 3cafb7f1 +052_convolutional_bn: 3d676b6d +052_convolutional_lrelu: 3cc669bf +053_shortcut: 3d58553c +054_convolutional: 3d4431ff +054_convolutional_bn: 3d77211d +054_convolutional_lrelu: 3cb60dd9 +055_convolutional: 3ccbdd32 +055_convolutional_bn: 3d9dacae +055_convolutional_lrelu: 3cd91763 +056_shortcut: 3d6109ac +057_convolutional: 3d52dd55 +057_convolutional_bn: 3d6c94d2 +057_convolutional_lrelu: 3cee7561 +058_convolutional: 3cb64c42 +058_convolutional_bn: 3d6277d4 +058_convolutional_lrelu: 3cf0c943 +059_shortcut: 3d7f0354 +060_convolutional: 3d92ea8a +060_convolutional_bn: 3d72094c +060_convolutional_lrelu: 3cac1c4d +061_convolutional: 3cabc7bb +061_convolutional_bn: 3dbdaf93 +061_convolutional_lrelu: 3d0fe91c +062_shortcut: 3d676bdc +063_convolutional: 3e17162f +063_convolutional_bn: 3da49ac5 +063_convolutional_lrelu: 3cd12d71 +064_convolutional: 3ccb7e4a +064_convolutional_bn: 3d9f890e +064_convolutional_lrelu: 3cd6b1e3 +065_convolutional: 3d51c337 +065_convolutional_bn: 3d54a422 +065_convolutional_lrelu: 3cad4d05 +066_shortcut: 3cbd0480 +067_convolutional: 3d25bf62 +067_convolutional_bn: 3db19b82 +067_convolutional_lrelu: 3cadce78 +068_convolutional: 3cd4fc22 +068_convolutional_bn: 3d3a8d52 +068_convolutional_lrelu: 3c958a32 +069_shortcut: 3cf85d2e +070_convolutional: 3d20476b +070_convolutional_bn: 3da8df54 +070_convolutional_lrelu: 3caa0643 +071_convolutional: 3ce7af07 +071_convolutional_bn: 3d62d3c4 +071_convolutional_lrelu: 3c933e24 +072_shortcut: 3d2010ef +073_convolutional: 3d96e66c +073_convolutional_bn: 3dce8bc7 +073_convolutional_lrelu: 3c9a4f55 +074_convolutional: 3cbad12f +074_convolutional_bn: 3ddf4782 +074_convolutional_lrelu: 3cf96e12 +075_shortcut: 3d574761 +076_convolutional: 3d73897b +076_convolutional_bn: 3d8ce858 +076_convolutional_lrelu: 3d09d4cc +077_convolutional: 3d6a1055 +077_convolutional_bn: 3d80fb64 +077_convolutional_lrelu: 3d064bfc +078_convolutional: 3d836b76 +078_convolutional_bn: 3d7cddf5 +078_convolutional_lrelu: 3cd7e74a +079_convolutional: 3d33fd20 +079_convolutional_bn: 3d4d6a5b +079_convolutional_lrelu: 3cffb82b +080_convolutional: 3d48acf5 +080_convolutional_bn: 3d5990ea +080_convolutional_lrelu: 3ca7b18d +081_convolutional: 3d293608 +081_convolutional_bn: 3d8243ac +081_convolutional_lrelu: 3d2a41ed +082_convolutional: 3e600ce3 +085_convolutional: 3d15d9b4 +085_convolutional_bn: 3d9d4e34 +085_convolutional_lrelu: 3d0d6c79 +086_upsample: 3d676bdc +087_route: 3d676bdc +088_convolutional: 3de3e9c6 +088_convolutional_bn: 3d8bbec1 +088_convolutional_lrelu: 3ce2f1fc +089_convolutional: 3d97111d +089_convolutional_bn: 3d7d6e5b +089_convolutional_lrelu: 3cbd02b2 +090_convolutional: 3d5b221f +090_convolutional_bn: 3d5a38be +090_convolutional_lrelu: 3c9c1ce1 +091_convolutional: 3d60f3f0 +091_convolutional_bn: 3d739f0a +091_convolutional_lrelu: 3cbcc688 +092_convolutional: 3d6e15cb +092_convolutional_bn: 3d858930 +092_convolutional_lrelu: 3ca557a8 +093_convolutional: 3d23daec +093_convolutional_bn: 3d8df75f 
+093_convolutional_lrelu: 3d2cdaed +094_convolutional: 3e532129 +097_convolutional: 3d162469 +097_convolutional_bn: 3da84cb3 +097_convolutional_lrelu: 3d5f6229 +098_upsample: 3d9f6d7c +099_route: 3d9f6d7c +100_convolutional: 3dfac67e +100_convolutional_bn: 3d866014 +100_convolutional_lrelu: 3d0dce7d +101_convolutional: 3daa6cbe +101_convolutional_bn: 3d78cb1a +101_convolutional_lrelu: 3d0899ae +102_convolutional: 3d52238b +102_convolutional_bn: 3d81353d +102_convolutional_lrelu: 3cd2c022 +103_convolutional: 3dba7093 +103_convolutional_bn: 3d5f9b69 +103_convolutional_lrelu: 3cdd97b4 +104_convolutional: 3d7c40c4 +104_convolutional_bn: 3d84edc8 +104_convolutional_lrelu: 3d03fc1f +105_convolutional: 3dc5494f +105_convolutional_bn: 3da84277 +105_convolutional_lrelu: 3d4c3fb5 +106_convolutional: 3e82ccc7 +(Unnamed Layer* 246) [PluginV2IOExt]_output_0: 3efa5428 +(Unnamed Layer* 247) [PluginV2IOExt]_output_0: 3ee20e1c +(Unnamed Layer* 248) [PluginV2IOExt]_output_0: 3eea2ede diff --git a/yolo/calib_cache/calib_yolov3-spp-int8-608.bin b/yolo/calib_cache/calib_yolov3-spp-int8-608.bin new file mode 100644 index 0000000..6bbf71b --- /dev/null +++ b/yolo/calib_cache/calib_yolov3-spp-int8-608.bin @@ -0,0 +1,258 @@ +TRT-7103-EntropyCalibration2 +000_net: 3c010a14 +001_convolutional: 3cc1e6c2 +001_convolutional_bn: 3e3dae13 +001_convolutional_lrelu: 3ddcdcb3 +002_convolutional: 3ea885a3 +002_convolutional_bn: 3d877b95 +002_convolutional_lrelu: 3d487bb9 +003_convolutional: 3e079126 +003_convolutional_bn: 3e07c5a9 +003_convolutional_lrelu: 3d8a81fc +004_convolutional: 3e0897f9 +004_convolutional_bn: 3df6b69d +004_convolutional_lrelu: 3d74ba9f +005_shortcut: 3db98c07 +006_convolutional: 3ea9ffa3 +006_convolutional_bn: 3e049b0f +006_convolutional_lrelu: 3d6afafa +007_convolutional: 3da8e7ab +007_convolutional_bn: 3dac3f22 +007_convolutional_lrelu: 3d1aad80 +008_convolutional: 3d02ad5d +008_convolutional_bn: 3d3d3011 +008_convolutional_lrelu: 3ce0b983 +009_shortcut: 3d65222f +010_convolutional: 3e0361c9 +010_convolutional_bn: 3e02d26d +010_convolutional_lrelu: 3d2d7316 +011_convolutional: 3d627308 +011_convolutional_bn: 3daebf2f +011_convolutional_lrelu: 3d14a813 +012_shortcut: 3dacd17b +013_convolutional: 3e7e41a4 +013_convolutional_bn: 3d934c2e +013_convolutional_lrelu: 3d1b9c4b +014_convolutional: 3d328d13 +014_convolutional_bn: 3d9693da +014_convolutional_lrelu: 3d013a50 +015_convolutional: 3d145f8c +015_convolutional_bn: 3d33f221 +015_convolutional_lrelu: 3c77ff83 +016_shortcut: 3d223726 +017_convolutional: 3d79e1d7 +017_convolutional_bn: 3d910272 +017_convolutional_lrelu: 3d1818d7 +018_convolutional: 3d2430e9 +018_convolutional_bn: 3d179c24 +018_convolutional_lrelu: 3cb59c76 +019_shortcut: 3d3fad4e +020_convolutional: 3d6aa953 +020_convolutional_bn: 3d971117 +020_convolutional_lrelu: 3d0a4a66 +021_convolutional: 3cf79e4a +021_convolutional_bn: 3d51252f +021_convolutional_lrelu: 3cb389a7 +022_shortcut: 3d52790c +023_convolutional: 3d488983 +023_convolutional_bn: 3d816e4c +023_convolutional_lrelu: 3d1cd21d +024_convolutional: 3d12341b +024_convolutional_bn: 3d3ce6f1 +024_convolutional_lrelu: 3cbdf46e +025_shortcut: 3d65ade1 +026_convolutional: 3d60a84b +026_convolutional_bn: 3d93a69c +026_convolutional_lrelu: 3d013552 +027_convolutional: 3cee3507 +027_convolutional_bn: 3d7180b6 +027_convolutional_lrelu: 3cef1b2b +028_shortcut: 3d89433f +029_convolutional: 3d906be3 +029_convolutional_bn: 3d8c0d4e +029_convolutional_lrelu: 3d0547d6 +030_convolutional: 3cd3c986 +030_convolutional_bn: 3dce28f8 
+030_convolutional_lrelu: 3d105248 +031_shortcut: 3d980526 +032_convolutional: 3d92a4fe +032_convolutional_bn: 3d75e748 +032_convolutional_lrelu: 3cf0bf5e +033_convolutional: 3ce85e4c +033_convolutional_bn: 3d9fce65 +033_convolutional_lrelu: 3d07d676 +034_shortcut: 3da13385 +035_convolutional: 3dbe8edc +035_convolutional_bn: 3d88b896 +035_convolutional_lrelu: 3ce5aeae +036_convolutional: 3cbb48d8 +036_convolutional_bn: 3da707a0 +036_convolutional_lrelu: 3d23e7ce +037_shortcut: 3d935901 +038_convolutional: 3e42c771 +038_convolutional_bn: 3d9cc657 +038_convolutional_lrelu: 3d052b4a +039_convolutional: 3ca36e5c +039_convolutional_bn: 3d798f57 +039_convolutional_lrelu: 3d1a9a24 +040_convolutional: 3d43e821 +040_convolutional_bn: 3cf02fb2 +040_convolutional_lrelu: 3c130957 +041_shortcut: 3d037bf1 +042_convolutional: 3cdc8f82 +042_convolutional_bn: 3d86b281 +042_convolutional_lrelu: 3d0c3612 +043_convolutional: 3d110022 +043_convolutional_bn: 3d2e627a +043_convolutional_lrelu: 3c9ca38c +044_shortcut: 3d06771f +045_convolutional: 3d06694e +045_convolutional_bn: 3d642037 +045_convolutional_lrelu: 3cf20a07 +046_convolutional: 3ca9f1fa +046_convolutional_bn: 3d417080 +046_convolutional_lrelu: 3c920518 +047_shortcut: 3d28afee +048_convolutional: 3d102eac +048_convolutional_bn: 3d685214 +048_convolutional_lrelu: 3cdff0c6 +049_convolutional: 3cb63557 +049_convolutional_bn: 3d442ca2 +049_convolutional_lrelu: 3ca82011 +050_shortcut: 3d3162ce +051_convolutional: 3d175f15 +051_convolutional_bn: 3d6b2831 +051_convolutional_lrelu: 3cc9fd32 +052_convolutional: 3cb834a6 +052_convolutional_bn: 3d62567a +052_convolutional_lrelu: 3cca7ca7 +053_shortcut: 3d61f317 +054_convolutional: 3d3a818c +054_convolutional_bn: 3d8014b4 +054_convolutional_lrelu: 3cb7e663 +055_convolutional: 3cc295f2 +055_convolutional_bn: 3d9f39c8 +055_convolutional_lrelu: 3d058ab9 +056_shortcut: 3d68d058 +057_convolutional: 3d3ddc75 +057_convolutional_bn: 3d6badad +057_convolutional_lrelu: 3cddc998 +058_convolutional: 3c94d95a +058_convolutional_bn: 3d81d762 +058_convolutional_lrelu: 3cfc320c +059_shortcut: 3d8b8048 +060_convolutional: 3d8ae0c9 +060_convolutional_bn: 3d62b696 +060_convolutional_lrelu: 3ca0c33d +061_convolutional: 3c94812c +061_convolutional_bn: 3dbea4bb +061_convolutional_lrelu: 3cfeac50 +062_shortcut: 3d4cad06 +063_convolutional: 3e0b3199 +063_convolutional_bn: 3d989a57 +063_convolutional_lrelu: 3cf7c7b9 +064_convolutional: 3ca153d8 +064_convolutional_bn: 3d8c72d2 +064_convolutional_lrelu: 3d091f48 +065_convolutional: 3d367976 +065_convolutional_bn: 3d5db8ab +065_convolutional_lrelu: 3c86a0a0 +066_shortcut: 3cf710fb +067_convolutional: 3cca075e +067_convolutional_bn: 3d92712b +067_convolutional_lrelu: 3c96748b +068_convolutional: 3cb833f7 +068_convolutional_bn: 3d4560cc +068_convolutional_lrelu: 3cab9b60 +069_shortcut: 3cf987de +070_convolutional: 3cc1e53d +070_convolutional_bn: 3d695425 +070_convolutional_lrelu: 3ccf51cd +071_convolutional: 3cc4349b +071_convolutional_bn: 3d49aaa2 +071_convolutional_lrelu: 3cdc95d3 +072_shortcut: 3d108112 +073_convolutional: 3d15383b +073_convolutional_bn: 3d8b945b +073_convolutional_lrelu: 3c9fa1ee +074_convolutional: 3cb27484 +074_convolutional_bn: 3d95f919 +074_convolutional_lrelu: 3d0fa80c +075_shortcut: 3d4f6671 +076_convolutional: 3d55c415 +076_convolutional_bn: 3d90c0ab +076_convolutional_lrelu: 3d1481a8 +077_convolutional: 3dafcaa8 +077_convolutional_bn: 3d9a1eee +077_convolutional_lrelu: 3d0acd89 +078_convolutional: 3e204e75 +078_convolutional_bn: 3da289aa +078_convolutional_lrelu: 
3d143dc3 +079_maxpool: 3d143dc3 +081_maxpool: 3d143dc3 +083_maxpool: 3d143dc3 +084_route: 3d143dc3 +085_convolutional: 3d843c75 +085_convolutional_bn: 3d9a33a2 +085_convolutional_lrelu: 3d04fc19 +086_convolutional: 3d7e805b +086_convolutional_bn: 3d7404de +086_convolutional_lrelu: 3d034c6e +087_convolutional: 3d436436 +087_convolutional_bn: 3d54aef3 +087_convolutional_lrelu: 3d015c07 +088_convolutional: 3d7ed7d7 +088_convolutional_bn: 3d8b5c9d +088_convolutional_lrelu: 3d1e87df +089_convolutional: 3e5e639a +092_convolutional: 3d4060ca +092_convolutional_bn: 3d8f5a9e +092_convolutional_lrelu: 3d2d5cac +093_upsample: 3d4cad06 +094_route: 3d4cad06 +095_convolutional: 3dcc68f9 +095_convolutional_bn: 3d8521b9 +095_convolutional_lrelu: 3d289238 +096_convolutional: 3da93126 +096_convolutional_bn: 3d87f05f +096_convolutional_lrelu: 3d182fbf +097_convolutional: 3d44121b +097_convolutional_bn: 3d839409 +097_convolutional_lrelu: 3cdb454d +098_convolutional: 3d85bd57 +098_convolutional_bn: 3d7da065 +098_convolutional_lrelu: 3d04eaf6 +099_convolutional: 3d5ccbb9 +099_convolutional_bn: 3d773490 +099_convolutional_lrelu: 3cd708ff +100_convolutional: 3d6feaea +100_convolutional_bn: 3d882839 +100_convolutional_lrelu: 3d2e3ea8 +101_convolutional: 3e45b03a +104_convolutional: 3d2f9c83 +104_convolutional_bn: 3dba946d +104_convolutional_lrelu: 3d69e03b +105_upsample: 3d935901 +106_route: 3d935901 +107_convolutional: 3e161afe +107_convolutional_bn: 3d84f142 +107_convolutional_lrelu: 3d0e35d7 +108_convolutional: 3dc362e6 +108_convolutional_bn: 3d7555e5 +108_convolutional_lrelu: 3d00c803 +109_convolutional: 3d4f4d7f +109_convolutional_bn: 3d86c3ff +109_convolutional_lrelu: 3d194172 +110_convolutional: 3db35943 +110_convolutional_bn: 3d7b99e9 +110_convolutional_lrelu: 3d077a43 +111_convolutional: 3dbfbfd5 +111_convolutional_bn: 3d8f0c83 +111_convolutional_lrelu: 3d180439 +112_convolutional: 3de396c9 +112_convolutional_bn: 3d9cc189 +112_convolutional_lrelu: 3d471581 +113_convolutional: 3e5c717d +(Unnamed Layer* 253) [PluginV2IOExt]_output_0: 3ef23e7d +(Unnamed Layer* 254) [PluginV2IOExt]_output_0: 3ee20891 +(Unnamed Layer* 255) [PluginV2IOExt]_output_0: 3de21d3a diff --git a/yolo/calib_cache/calib_yolov3-tiny-int8-416.bin b/yolo/calib_cache/calib_yolov3-tiny-int8-416.bin new file mode 100644 index 0000000..1fac48d --- /dev/null +++ b/yolo/calib_cache/calib_yolov3-tiny-int8-416.bin @@ -0,0 +1,47 @@ +TRT-7103-EntropyCalibration2 +000_net: 3c010a14 +001_convolutional: 3d77cc4d +001_convolutional_bn: 3eb97554 +001_convolutional_lrelu: 3e3cfaf6 +002_maxpool: 3e3cfaf6 +003_convolutional: 3fd20362 +003_convolutional_bn: 3f05ab3e +003_convolutional_lrelu: 3dba5110 +004_maxpool: 3dba5110 +005_convolutional: 3f0ff935 +005_convolutional_bn: 3e98332b +005_convolutional_lrelu: 3dc89fbc +006_maxpool: 3dc89fbc +007_convolutional: 3f13aa2f +007_convolutional_bn: 3e6a8bc5 +007_convolutional_lrelu: 3daf3f0b +008_maxpool: 3daf3f0b +009_convolutional: 3e9a71e8 +009_convolutional_bn: 3e277a8e +009_convolutional_lrelu: 3d8e5618 +010_maxpool: 3d8b6f69 +011_convolutional: 3e32c610 +011_convolutional_bn: 3e0d719f +011_convolutional_lrelu: 3d3e0683 +012_maxpool: 3d3e0683 +013_convolutional: 3dc55cef +013_convolutional_bn: 3ec090b7 +013_convolutional_lrelu: 3e1a4216 +014_convolutional: 3e5f4d5c +014_convolutional_bn: 3d86be13 +014_convolutional_lrelu: 3cff8f32 +015_convolutional: 3d7e0dfb +015_convolutional_bn: 3dc57801 +015_convolutional_lrelu: 3d5eb027 +016_convolutional: 3e535004 +019_convolutional: 3d28d5ce +019_convolutional_bn: 
3dad20cf +019_convolutional_lrelu: 3d6086c9 +020_upsample: 3d8e5618 +021_route: 3d8e5618 +022_convolutional: 3e3be517 +022_convolutional_bn: 3db901c1 +022_convolutional_lrelu: 3d58aa42 +023_convolutional: 3e46f24e +(Unnamed Layer* 43) [PluginV2IOExt]_output_0: 3efa468d +(Unnamed Layer* 44) [PluginV2IOExt]_output_0: 3ee1f1e4 diff --git a/yolo/calib_cache/calib_yolov4-int8-608.bin b/yolo/calib_cache/calib_yolov4-int8-608.bin new file mode 100644 index 0000000..c54b952 --- /dev/null +++ b/yolo/calib_cache/calib_yolov4-int8-608.bin @@ -0,0 +1,511 @@ +TRT-7103-EntropyCalibration2 +000_net: 3c010a14 +001_convolutional: 3da6aff8 +001_convolutional_bn: 3ea6a387 +001_convolutional_softplus: 3e296d45 +001_convolutional_tanh: 3c010a14 +001_convolutional_mish: 3e17fbd6 +002_convolutional: 3fb53648 +002_convolutional_bn: 3e9383f9 +002_convolutional_softplus: 3e2640de +002_convolutional_tanh: 3c010a14 +002_convolutional_mish: 3e8d7fc8 +003_convolutional: 3f1d0b4c +003_convolutional_bn: 3e569c6c +003_convolutional_softplus: 3de204c4 +003_convolutional_tanh: 3c010a14 +003_convolutional_mish: 3d8f6f42 +005_convolutional: 3f12c8ba +005_convolutional_bn: 3e0d00c7 +005_convolutional_softplus: 3dba9b4b +005_convolutional_tanh: 3c010a14 +005_convolutional_mish: 3dab1388 +006_convolutional: 3e938548 +006_convolutional_bn: 3e6d6234 +006_convolutional_softplus: 3e3874f1 +006_convolutional_tanh: 3c010a14 +006_convolutional_mish: 3dddcb43 +007_convolutional: 3f2a4aa7 +007_convolutional_bn: 3e5384a9 +007_convolutional_softplus: 3df5c8f6 +007_convolutional_tanh: 3c010a14 +007_convolutional_mish: 3dda4c4a +008_shortcut: 3e528e26 +009_convolutional: 3f01ddd0 +009_convolutional_bn: 3e58618d +009_convolutional_softplus: 3de09ee4 +009_convolutional_tanh: 3c010a14 +009_convolutional_mish: 3d8f6f42 +010_route: 3d8f6f42 +011_convolutional: 3eef7ec3 +011_convolutional_bn: 3e3cc2f2 +011_convolutional_softplus: 3ddecbd2 +011_convolutional_tanh: 3c010a14 +011_convolutional_mish: 3da723ff +012_convolutional: 3f8e6c14 +012_convolutional_bn: 3e175ef7 +012_convolutional_softplus: 3db368a7 +012_convolutional_tanh: 3c010a14 +012_convolutional_mish: 3da47a3e +013_convolutional: 3ec71022 +013_convolutional_bn: 3df7f8cd +013_convolutional_softplus: 3db10627 +013_convolutional_tanh: 3c010a14 +013_convolutional_mish: 3da03ba8 +015_convolutional: 3ea96d61 +015_convolutional_bn: 3d9d8cdd +015_convolutional_softplus: 3d8abb2d +015_convolutional_tanh: 3c021427 +015_convolutional_mish: 3d804d31 +016_convolutional: 3e318b56 +016_convolutional_bn: 3da302a3 +016_convolutional_softplus: 3d902621 +016_convolutional_tanh: 3c01f3e7 +016_convolutional_mish: 3d9e63bb +017_convolutional: 3e863e49 +017_convolutional_bn: 3dbdb322 +017_convolutional_softplus: 3d9893cf +017_convolutional_tanh: 3c021427 +017_convolutional_mish: 3d392afd +018_shortcut: 3dd31aa3 +019_convolutional: 3e4cac42 +019_convolutional_bn: 3d9b0161 +019_convolutional_softplus: 3d5f678f +019_convolutional_tanh: 3c061c33 +019_convolutional_mish: 3d55644e +020_convolutional: 3e8c293c +020_convolutional_bn: 3e1c4b6a +020_convolutional_softplus: 3da6a2dd +020_convolutional_tanh: 3c010a14 +020_convolutional_mish: 3da6a2dd +021_shortcut: 3e1adb45 +022_convolutional: 3ed98343 +022_convolutional_bn: 3e0a40cc +022_convolutional_softplus: 3db71b3f +022_convolutional_tanh: 3c010a14 +022_convolutional_mish: 3da03ba8 +023_route: 3da03ba8 +024_convolutional: 3ee448cf +024_convolutional_bn: 3e1e7ef8 +024_convolutional_softplus: 3d7bb1f9 +024_convolutional_tanh: 3c010a14 +024_convolutional_mish: 3d8607b8 
+025_convolutional: 3f08c3e7 +025_convolutional_bn: 3df97e0e +025_convolutional_softplus: 3d97ba96 +025_convolutional_tanh: 3c010a14 +025_convolutional_mish: 3d38c530 +026_convolutional: 3e8d62f0 +026_convolutional_bn: 3dedaad6 +026_convolutional_softplus: 3d93e66e +026_convolutional_tanh: 3c021427 +026_convolutional_mish: 3d83b0d4 +028_convolutional: 3e8973a3 +028_convolutional_bn: 3dba83a4 +028_convolutional_softplus: 3d994c28 +028_convolutional_tanh: 3c010a14 +028_convolutional_mish: 3d8240d3 +029_convolutional: 3e21d9ce +029_convolutional_bn: 3dbe8121 +029_convolutional_softplus: 3d717a22 +029_convolutional_tanh: 3c010a14 +029_convolutional_mish: 3d1141b8 +030_convolutional: 3e9586c8 +030_convolutional_bn: 3daf7179 +030_convolutional_softplus: 3d4e4250 +030_convolutional_tanh: 3c021427 +030_convolutional_mish: 3d235725 +031_shortcut: 3db5fe0f +032_convolutional: 3e4179ab +032_convolutional_bn: 3dc46552 +032_convolutional_softplus: 3d78390e +032_convolutional_tanh: 3c01121e +032_convolutional_mish: 3d24ec37 +033_convolutional: 3e43846b +033_convolutional_bn: 3dd3beb8 +033_convolutional_softplus: 3d5bfe3f +033_convolutional_tanh: 3c03162a +033_convolutional_mish: 3d107ef6 +034_shortcut: 3dbe8cd4 +035_convolutional: 3e706786 +035_convolutional_bn: 3e08b8e1 +035_convolutional_softplus: 3d690deb +035_convolutional_tanh: 3c02141c +035_convolutional_mish: 3d24584c +036_convolutional: 3e30ec80 +036_convolutional_bn: 3dc29a0a +036_convolutional_softplus: 3d5ee2b8 +036_convolutional_tanh: 3c02141f +036_convolutional_mish: 3cd5180c +037_shortcut: 3dfa1fdd +038_convolutional: 3ea10c50 +038_convolutional_bn: 3e12447d +038_convolutional_softplus: 3d5a0570 +038_convolutional_tanh: 3c011223 +038_convolutional_mish: 3d02a407 +039_convolutional: 3e5baa4a +039_convolutional_bn: 3e065b91 +039_convolutional_softplus: 3dcd6135 +039_convolutional_tanh: 3c010a14 +039_convolutional_mish: 3d15f581 +040_shortcut: 3e26c262 +041_convolutional: 3e8d42dc +041_convolutional_bn: 3ddb7633 +041_convolutional_softplus: 3d4a02f0 +041_convolutional_tanh: 3c0111e6 +041_convolutional_mish: 3d119983 +042_convolutional: 3dffd3ad +042_convolutional_bn: 3db72fe8 +042_convolutional_softplus: 3d7bc282 +042_convolutional_tanh: 3c021427 +042_convolutional_mish: 3d38f535 +043_shortcut: 3e253907 +044_convolutional: 3ea7c803 +044_convolutional_bn: 3dd24023 +044_convolutional_softplus: 3d2ee27e +044_convolutional_tanh: 3c011209 +044_convolutional_mish: 3cc691eb +045_convolutional: 3df677c6 +045_convolutional_bn: 3df0ab1f +045_convolutional_softplus: 3d8ab5cf +045_convolutional_tanh: 3c010a14 +045_convolutional_mish: 3d21fa8d +046_shortcut: 3e2b4214 +047_convolutional: 3e9bf0c3 +047_convolutional_bn: 3dc24ce9 +047_convolutional_softplus: 3d48ddaf +047_convolutional_tanh: 3c011222 +047_convolutional_mish: 3cec277c +048_convolutional: 3e067637 +048_convolutional_bn: 3e175474 +048_convolutional_softplus: 3db71eb1 +048_convolutional_tanh: 3c010a14 +048_convolutional_mish: 3da7e136 +049_shortcut: 3e5afcbe +050_convolutional: 3ed4a1e6 +050_convolutional_bn: 3dea922f +050_convolutional_softplus: 3d29bb2b +050_convolutional_tanh: 3c010a14 +050_convolutional_mish: 3d0e1420 +051_convolutional: 3e0be5b5 +051_convolutional_bn: 3e187487 +051_convolutional_softplus: 3dba801d +051_convolutional_tanh: 3c010a14 +051_convolutional_mish: 3daafa9d +052_shortcut: 3e786f2a +053_convolutional: 3f251892 +053_convolutional_bn: 3df5ec06 +053_convolutional_softplus: 3dad6084 +053_convolutional_tanh: 3c010a14 +053_convolutional_mish: 3d83b0d4 +054_route: 3d83b0d4 
+055_convolutional: 3e97dd13 +055_convolutional_bn: 3e1ea207 +055_convolutional_softplus: 3d4dc4f2 +055_convolutional_tanh: 3c010a14 +055_convolutional_mish: 3d39f7e7 +056_convolutional: 3eb1fce8 +056_convolutional_bn: 3dd683d4 +056_convolutional_softplus: 3d8c3215 +056_convolutional_tanh: 3c010a14 +056_convolutional_mish: 3d0e6272 +057_convolutional: 3e1c7a19 +057_convolutional_bn: 3db82deb +057_convolutional_softplus: 3d7d9903 +057_convolutional_tanh: 3c010a14 +057_convolutional_mish: 3d160c32 +059_convolutional: 3e506407 +059_convolutional_bn: 3d9f9d99 +059_convolutional_softplus: 3d7c9682 +059_convolutional_tanh: 3c021411 +059_convolutional_mish: 3d3af590 +060_convolutional: 3db81469 +060_convolutional_bn: 3db931a1 +060_convolutional_softplus: 3d93914f +060_convolutional_tanh: 3c021427 +060_convolutional_mish: 3d017403 +061_convolutional: 3ebd1ec2 +061_convolutional_bn: 3da85604 +061_convolutional_softplus: 3d5dbe02 +061_convolutional_tanh: 3c03161e +061_convolutional_mish: 3d226600 +062_shortcut: 3d8e58d4 +063_convolutional: 3dad8279 +063_convolutional_bn: 3da76549 +063_convolutional_softplus: 3d512597 +063_convolutional_tanh: 3c011223 +063_convolutional_mish: 3d25a0b9 +064_convolutional: 3e175192 +064_convolutional_bn: 3db03377 +064_convolutional_softplus: 3d35ed9a +064_convolutional_tanh: 3c01114d +064_convolutional_mish: 3caf9999 +065_shortcut: 3d7f109e +066_convolutional: 3e01908b +066_convolutional_bn: 3dc251b0 +066_convolutional_softplus: 3d552ea7 +066_convolutional_tanh: 3c0111fe +066_convolutional_mish: 3d11918e +067_convolutional: 3de36fdb +067_convolutional_bn: 3dab86db +067_convolutional_softplus: 3d347d29 +067_convolutional_tanh: 3c011138 +067_convolutional_mish: 3d02bdc7 +068_shortcut: 3db379aa +069_convolutional: 3e06e991 +069_convolutional_bn: 3e031644 +069_convolutional_softplus: 3d3123db +069_convolutional_tanh: 3c011204 +069_convolutional_mish: 3cc4695a +070_convolutional: 3e082370 +070_convolutional_bn: 3df795f0 +070_convolutional_softplus: 3d74e50b +070_convolutional_tanh: 3c031628 +070_convolutional_mish: 3d5dc953 +071_shortcut: 3dc06bd4 +072_convolutional: 3e0f9dde +072_convolutional_bn: 3db1944b +072_convolutional_softplus: 3d4aaf62 +072_convolutional_tanh: 3c0111dc +072_convolutional_mish: 3d0fd5ed +073_convolutional: 3dc66a6a +073_convolutional_bn: 3dccd1c3 +073_convolutional_softplus: 3d834750 +073_convolutional_tanh: 3c0213fc +073_convolutional_mish: 3d0fe4cb +074_shortcut: 3dcfbd61 +075_convolutional: 3e15e4c1 +075_convolutional_bn: 3db3383a +075_convolutional_softplus: 3d2b90b3 +075_convolutional_tanh: 3c02113a +075_convolutional_mish: 3ceb5f10 +076_convolutional: 3db6ba74 +076_convolutional_bn: 3dd2e09e +076_convolutional_softplus: 3d741c69 +076_convolutional_tanh: 3c010a14 +076_convolutional_mish: 3d58cf6e +077_shortcut: 3dff3205 +078_convolutional: 3e424805 +078_convolutional_bn: 3db97a3c +078_convolutional_softplus: 3d2c6de4 +078_convolutional_tanh: 3c010fa6 +078_convolutional_mish: 3d0332bf +079_convolutional: 3dc29c00 +079_convolutional_bn: 3debf2e9 +079_convolutional_softplus: 3d707c08 +079_convolutional_tanh: 3c010a14 +079_convolutional_mish: 3d0e49e1 +080_shortcut: 3e1abc32 +081_convolutional: 3e6626a4 +081_convolutional_bn: 3db644c5 +081_convolutional_softplus: 3d1d1ed9 +081_convolutional_tanh: 3c011197 +081_convolutional_mish: 3cafa27f +082_convolutional: 3daec08c +082_convolutional_bn: 3e09a51a +082_convolutional_softplus: 3d915698 +082_convolutional_tanh: 3c010a14 +082_convolutional_mish: 3d8782a8 +083_shortcut: 3e382b5d +084_convolutional: 
3ec83556 +084_convolutional_bn: 3dcdf03d +084_convolutional_softplus: 3d827ec2 +084_convolutional_tanh: 3c021426 +084_convolutional_mish: 3d160c32 +085_route: 3d160c32 +086_convolutional: 3e459e81 +086_convolutional_bn: 3e135046 +086_convolutional_softplus: 3d4a0725 +086_convolutional_tanh: 3c010a14 +086_convolutional_mish: 3d3b1017 +087_convolutional: 3e598534 +087_convolutional_bn: 3db52443 +087_convolutional_softplus: 3d205b0d +087_convolutional_tanh: 3c010a14 +087_convolutional_mish: 3d0e39a0 +088_convolutional: 3da5c757 +088_convolutional_bn: 3e0a0194 +088_convolutional_softplus: 3d05a7db +088_convolutional_tanh: 3c010a14 +088_convolutional_mish: 3d24e64e +090_convolutional: 3d8d17c5 +090_convolutional_bn: 3da38f3a +090_convolutional_softplus: 3d4f2686 +090_convolutional_tanh: 3c011223 +090_convolutional_mish: 3cc704b3 +091_convolutional: 3d28f40b +091_convolutional_bn: 3db158be +091_convolutional_softplus: 3d318655 +091_convolutional_tanh: 3c010a14 +091_convolutional_mish: 3d1fbc8b +092_convolutional: 3ea03076 +092_convolutional_bn: 3dd7e12b +092_convolutional_softplus: 3d22360e +092_convolutional_tanh: 3c010f4a +092_convolutional_mish: 3cc77029 +093_shortcut: 3d0712ee +094_convolutional: 3d67e7c1 +094_convolutional_bn: 3ddd0718 +094_convolutional_softplus: 3d2e4ee2 +094_convolutional_tanh: 3c010a14 +094_convolutional_mish: 3ced2ad6 +095_convolutional: 3db228a1 +095_convolutional_bn: 3e00baba +095_convolutional_softplus: 3d145200 +095_convolutional_tanh: 3c0111d3 +095_convolutional_mish: 3cb729c8 +096_shortcut: 3d2e3725 +097_convolutional: 3d94712a +097_convolutional_bn: 3dc951ef +097_convolutional_softplus: 3d34fad3 +097_convolutional_tanh: 3c01121e +097_convolutional_mish: 3ca623ee +098_convolutional: 3dc946d4 +098_convolutional_bn: 3e08652f +098_convolutional_softplus: 3d51ba2d +098_convolutional_tanh: 3c0315fb +098_convolutional_mish: 3cc6364b +099_shortcut: 3d65c687 +100_convolutional: 3d9368a5 +100_convolutional_bn: 3d9fe445 +100_convolutional_softplus: 3d067d20 +100_convolutional_tanh: 3c011126 +100_convolutional_mish: 3cd85a6d +101_convolutional: 3dbe050e +101_convolutional_bn: 3dc5c1cc +101_convolutional_softplus: 3d7c1e4d +101_convolutional_tanh: 3c031629 +101_convolutional_mish: 3d12d5fd +102_shortcut: 3d835161 +103_convolutional: 3e1a388d +103_convolutional_bn: 3dcff4e9 +103_convolutional_softplus: 3cef7e61 +103_convolutional_tanh: 3c0111ac +103_convolutional_mish: 3d24e64e +104_route: 3d24e64e +105_convolutional: 3d378b5b +105_convolutional_bn: 3dde51b2 +105_convolutional_softplus: 3d4f5d5c +105_convolutional_tanh: 3c021427 +105_convolutional_mish: 3d11e14d +106_convolutional: 3dd1ccd1 +106_convolutional_bn: 3db4909b +106_convolutional_lrelu: 3d3e9554 +107_convolutional: 3e6bbcf6 +107_convolutional_bn: 3d62fae8 +107_convolutional_lrelu: 3d098c08 +108_convolutional: 3e57167e +108_convolutional_bn: 3d69182f +108_convolutional_lrelu: 3d6315b8 +109_maxpool: 3d6315b8 +111_maxpool: 3d6315b8 +113_maxpool: 3d6315b8 +114_route: 3d6315b8 +115_convolutional: 3e975b6c +115_convolutional_bn: 3e3ffa3e +115_convolutional_lrelu: 3d478d26 +116_convolutional: 3e96cfcf +116_convolutional_bn: 3e1f5386 +116_convolutional_lrelu: 3d2c2404 +117_convolutional: 3e013937 +117_convolutional_bn: 3dafc777 +117_convolutional_lrelu: 3d406a0c +118_convolutional: 3e2472be +118_convolutional_bn: 3db75685 +118_convolutional_lrelu: 3d61eb07 +119_upsample: 3d8b686d +121_convolutional: 3dd3583e +121_convolutional_bn: 3df79627 +121_convolutional_lrelu: 3d8b686d +122_route: 3d8b686d +123_convolutional: 3e78551f 
+123_convolutional_bn: 3e06f23b +123_convolutional_lrelu: 3d9afbda +124_convolutional: 3ec91fd2 +124_convolutional_bn: 3dddea03 +124_convolutional_lrelu: 3d7a7f34 +125_convolutional: 3e357062 +125_convolutional_bn: 3e105b62 +125_convolutional_lrelu: 3d963d9e +126_convolutional: 3e9e68d8 +126_convolutional_bn: 3dec07b5 +126_convolutional_lrelu: 3d6f86d8 +127_convolutional: 3e4ab9ce +127_convolutional_bn: 3df50bd8 +127_convolutional_lrelu: 3d5df499 +128_convolutional: 3e482c42 +128_convolutional_bn: 3e1f8984 +128_convolutional_lrelu: 3d9f61bf +129_upsample: 3da79f33 +131_convolutional: 3dfe1df4 +131_convolutional_bn: 3e04dae5 +131_convolutional_lrelu: 3da79f33 +132_route: 3da79f33 +133_convolutional: 3ed4232f +133_convolutional_bn: 3e2a99f8 +133_convolutional_lrelu: 3da4d9f2 +134_convolutional: 3f0cba6a +134_convolutional_bn: 3e1fb5d2 +134_convolutional_lrelu: 3d824bb3 +135_convolutional: 3e8553b8 +135_convolutional_bn: 3e31fd22 +135_convolutional_lrelu: 3dc32006 +136_convolutional: 3f16c6d8 +136_convolutional_bn: 3df91ca0 +136_convolutional_lrelu: 3dcbe87c +137_convolutional: 3ecf149b +137_convolutional_bn: 3e940813 +137_convolutional_lrelu: 3daff33e +138_convolutional: 400b24ac +138_convolutional_bn: 3ded9b06 +138_convolutional_lrelu: 3d9285a1 +139_convolutional: 3eb67f3d +142_convolutional: 3eec4444 +142_convolutional_bn: 3e064b3d +142_convolutional_lrelu: 3d5df499 +143_route: 3d5df499 +144_convolutional: 3e3782d6 +144_convolutional_bn: 3dff93f4 +144_convolutional_lrelu: 3d73aced +145_convolutional: 3ea2181a +145_convolutional_bn: 3dcc7e51 +145_convolutional_lrelu: 3d3d80cb +146_convolutional: 3e339dcd +146_convolutional_bn: 3df741c2 +146_convolutional_lrelu: 3da73e4f +147_convolutional: 3ec12716 +147_convolutional_bn: 3dd63716 +147_convolutional_lrelu: 3d348d02 +148_convolutional: 3e5ee5c5 +148_convolutional_bn: 3e407ba6 +148_convolutional_lrelu: 3dc105c4 +149_convolutional: 3f42a297 +149_convolutional_bn: 3dc6953f +149_convolutional_lrelu: 3d2a1cb0 +150_convolutional: 3eab8522 +153_convolutional: 3e35e087 +153_convolutional_bn: 3dc8f32d +153_convolutional_lrelu: 3d406a0c +154_route: 3d406a0c +155_convolutional: 3dcc13cd +155_convolutional_bn: 3d9bbd98 +155_convolutional_lrelu: 3d0ae902 +156_convolutional: 3ddb1c39 +156_convolutional_bn: 3d82d2fd +156_convolutional_lrelu: 3cf31a37 +157_convolutional: 3d7bd773 +157_convolutional_bn: 3d998229 +157_convolutional_lrelu: 3d0e6b9c +158_convolutional: 3dd09e57 +158_convolutional_bn: 3d95eb83 +158_convolutional_lrelu: 3cd82f0a +159_convolutional: 3d97cd8f +159_convolutional_bn: 3dcdaf39 +159_convolutional_lrelu: 3d173dbd +160_convolutional: 3e5f62f2 +160_convolutional_bn: 3d8dedb4 +160_convolutional_lrelu: 3d2ee001 +161_convolutional: 3e63c8d9 +(Unnamed Layer* 506) [PluginV2IOExt]_output_0: 4016060c +(Unnamed Layer* 507) [PluginV2IOExt]_output_0: 3ef64102 +(Unnamed Layer* 508) [PluginV2IOExt]_output_0: 3efa5428 diff --git a/yolo/calib_cache/calib_yolov4-tiny-int8-416.bin b/yolo/calib_cache/calib_yolov4-tiny-int8-416.bin new file mode 100644 index 0000000..4c0faa0 --- /dev/null +++ b/yolo/calib_cache/calib_yolov4-tiny-int8-416.bin @@ -0,0 +1,77 @@ +TRT-7103-EntropyCalibration2 +000_net: 3c010a14 +001_convolutional: 3d1c8e6f +001_convolutional_bn: 3e4974f2 +001_convolutional_lrelu: 3dc86a5b +002_convolutional: 3ece0986 +002_convolutional_bn: 3e5586a9 +002_convolutional_lrelu: 3db733ca +003_convolutional: 3f0e2de4 +003_convolutional_bn: 3e60045a +003_convolutional_lrelu: 3da01dc1 +004_route: 3d82b8ef +005_convolutional: 3e6609bc 
+005_convolutional_bn: 3e24dc23 +005_convolutional_lrelu: 3dab644a +006_convolutional: 3e9b3825 +006_convolutional_bn: 3e14e8af +006_convolutional_lrelu: 3dab644a +007_route: 3dab644a +008_convolutional: 3e5af597 +008_convolutional_bn: 3e6056b7 +008_convolutional_lrelu: 3da01dc1 +009_route: 3da01dc1 +010_maxpool: 3da01dc1 +011_convolutional: 3f03ea95 +011_convolutional_bn: 3e06fedb +011_convolutional_lrelu: 3d82f2db +012_route: 3d48c651 +013_convolutional: 3e183f49 +013_convolutional_bn: 3e05719a +013_convolutional_lrelu: 3d94d68b +014_convolutional: 3e4a5ee5 +014_convolutional_bn: 3e031d6c +014_convolutional_lrelu: 3d94d68b +015_route: 3d94d68b +016_convolutional: 3e174a7d +016_convolutional_bn: 3e332af1 +016_convolutional_lrelu: 3d82f2db +017_route: 3d82f2db +018_maxpool: 3d82f2db +019_convolutional: 3e6a4db7 +019_convolutional_bn: 3dfa9047 +019_convolutional_lrelu: 3d5576c5 +020_route: 3d21b8b8 +021_convolutional: 3dbccf7c +021_convolutional_bn: 3df2a13a +021_convolutional_lrelu: 3d8c2655 +022_convolutional: 3e30f046 +022_convolutional_bn: 3e06213a +022_convolutional_lrelu: 3d8c2655 +023_route: 3d8c2655 +024_convolutional: 3def9521 +024_convolutional_bn: 3e5bb6dd +024_convolutional_lrelu: 3d5cf432 +025_route: 3d5576c5 +026_maxpool: 3d5576c5 +027_convolutional: 3e0fb964 +027_convolutional_bn: 3d904460 +027_convolutional_lrelu: 3ce5e15a +028_convolutional: 3d2a22a6 +028_convolutional_bn: 3daa0d77 +028_convolutional_lrelu: 3cf3a519 +029_convolutional: 3d8c79cd +029_convolutional_bn: 3dc4fed3 +029_convolutional_lrelu: 3d538d7b +030_convolutional: 3e5a4f2e +033_convolutional: 3d2151e9 +033_convolutional_bn: 3da734e6 +033_convolutional_lrelu: 3d2f6b4e +034_upsample: 3d5cf432 +035_route: 3d5cf432 +036_convolutional: 3e08d1ff +036_convolutional_bn: 3d9e9b27 +036_convolutional_lrelu: 3d31538c +037_convolutional: 3e46fd84 +(Unnamed Layer* 76) [PluginV2IOExt]_output_0: 3efa468d +(Unnamed Layer* 77) [PluginV2IOExt]_output_0: 3ef222c6 diff --git a/yolo/calibrator.py b/yolo/calibrator.py new file mode 100644 index 0000000..ecb4802 --- /dev/null +++ b/yolo/calibrator.py @@ -0,0 +1,153 @@ +"""calibrator.py + +The original code could be found in TensorRT-7.x sample code: +"samples/python/int8_caffe_mnist/calibrator.py". I made the +modification so that the Calibrator could handle MS-COCO dataset +images instead of MNIST. +""" + +# +# Copyright 1993-2019 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO LICENSEE: +# +# This source code and/or documentation ("Licensed Deliverables") are +# subject to NVIDIA intellectual property rights under U.S. and +# international Copyright laws. +# +# These Licensed Deliverables contained herein is PROPRIETARY and +# CONFIDENTIAL to NVIDIA and is being provided under the terms and +# conditions of a form of NVIDIA software license agreement by and +# between NVIDIA and Licensee ("License Agreement") or electronically +# accepted by Licensee. Notwithstanding any terms or conditions to +# the contrary in the License Agreement, reproduction or disclosure +# of the Licensed Deliverables to any third party without the express +# written consent of NVIDIA is prohibited. +# +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THESE LICENSED DELIVERABLES. +# +# U.S. Government End Users. These Licensed Deliverables are a +# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +# 1995), consisting of "commercial computer software" and "commercial +# computer software documentation" as such terms are used in 48 +# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +# only as a commercial end item. Consistent with 48 C.F.R.12.212 and +# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +# U.S. Government End Users acquire the Licensed Deliverables with +# only those rights set forth herein. +# +# Any use of the Licensed Deliverables in individual and commercial +# software must include, in the user documentation and internal +# comments to the code, the above Disclaimer and U.S. Government End +# Users Notice. + + +import os + +import numpy as np +import cv2 +import pycuda.autoinit +import pycuda.driver as cuda +import tensorrt as trt + + +def _preprocess_yolo(img, input_shape): + """Preprocess an image before TRT YOLO inferencing. + + # Args + img: uint8 numpy array of shape either (img_h, img_w, 3) + or (img_h, img_w) + input_shape: a tuple of (H, W) + + # Returns + preprocessed img: float32 numpy array of shape (3, H, W) + """ + if img.ndim == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + img = cv2.resize(img, (input_shape[1], input_shape[0])) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = img.transpose((2, 0, 1)).astype(np.float32) + img /= 255.0 + return img + + +class YOLOEntropyCalibrator(trt.IInt8EntropyCalibrator2): + """YOLOEntropyCalibrator + + This class implements TensorRT's IInt8EntropyCalibtrator2 interface. + It reads all images from the specified directory and generates INT8 + calibration data for YOLO models accordingly. + """ + + def __init__(self, img_dir, net_hw, cache_file, batch_size=1): + if not os.path.isdir(img_dir): + raise FileNotFoundError('%s does not exist' % img_dir) + if len(net_hw) != 2 or net_hw[0] % 32 or net_hw[1] % 32: + raise ValueError('bad net shape: %s' % str(net_hw)) + + super().__init__() # trt.IInt8EntropyCalibrator2.__init__(self) + + self.img_dir = img_dir + self.net_hw = net_hw + self.cache_file = cache_file + self.batch_size = batch_size + self.blob_size = 3 * net_hw[0] * net_hw[1] * np.dtype('float32').itemsize * batch_size + + self.jpgs = [f for f in os.listdir(img_dir) if f.endswith('.jpg')] + # The number "500" is NVIDIA's suggestion. See here: + # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#optimizing_int8_c + if len(self.jpgs) < 500: + print('WARNING: found less than 500 images in %s!' % img_dir) + self.current_index = 0 + + # Allocate enough memory for a whole batch. 
+ self.device_input = cuda.mem_alloc(self.blob_size) + + def __del__(self): + del self.device_input # free CUDA memory + + def get_batch_size(self): + return self.batch_size + + def get_batch(self, names): + if self.current_index + self.batch_size > len(self.jpgs): + return None + current_batch = int(self.current_index / self.batch_size) + + batch = [] + for i in range(self.batch_size): + img_path = os.path.join( + self.img_dir, self.jpgs[self.current_index + i]) + img = cv2.imread(img_path) + assert img is not None, 'failed to read %s' % img_path + batch.append(_preprocess_yolo(img, self.net_hw)) + batch = np.stack(batch) + assert batch.nbytes == self.blob_size + + cuda.memcpy_htod(self.device_input, np.ascontiguousarray(batch)) + self.current_index += self.batch_size + return [self.device_input] + + def read_calibration_cache(self): + # If there is a cache, use it instead of calibrating again. + # Otherwise, implicitly return None. + if os.path.exists(self.cache_file): + with open(self.cache_file, 'rb') as f: + return f.read() + + def write_calibration_cache(self, cache): + with open(self.cache_file, 'wb') as f: + f.write(cache) diff --git a/yolo/download_yolo.sh b/yolo/download_yolo.sh new file mode 100755 index 0000000..975660e --- /dev/null +++ b/yolo/download_yolo.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +set -e + +# yolov3-tiny +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-tiny.cfg -q --show-progress --no-clobber +wget https://pjreddie.com/media/files/yolov3-tiny.weights -q --show-progress --no-clobber + +# yolov3 +wget https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg -q --show-progress --no-clobber +wget https://pjreddie.com/media/files/yolov3.weights -q --show-progress --no-clobber + +# yolov3-spp +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov3-spp.cfg -q --show-progress --no-clobber +wget https://pjreddie.com/media/files/yolov3-spp.weights -q --show-progress --no-clobber + +# yolov4-tiny +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg -q --show-progress --no-clobber +wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -q --show-progress --no-clobber + +# yolov4 +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg -q --show-progress --no-clobber +wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -q --show-progress --no-clobber + +# yolov4-csp +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-csp.cfg -q --show-progress --no-clobber +wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-csp.weights -q --show-progress --no-clobber + +# yolov4x-mish +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4x-mish.cfg -q --show-progress --no-clobber +wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4x-mish.weights -q --show-progress --no-clobber + +# yolov4-p5 +wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-p5.cfg -q --show-progress --no-clobber +wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-p5.weights -q --show-progress --no-clobber + +echo +echo "Creating yolov3-tiny-288.cfg and yolov3-tiny-288.weights" +cat yolov3-tiny.cfg | sed -e '8s/width=416/width=288/' | sed -e '9s/height=416/height=288/' > yolov3-tiny-288.cfg +echo >> yolov3-tiny-288.cfg +ln -sf yolov3-tiny.weights 
yolov3-tiny-288.weights +echo "Creating yolov3-tiny-416.cfg and yolov3-tiny-416.weights" +cp yolov3-tiny.cfg yolov3-tiny-416.cfg +echo >> yolov3-tiny-416.cfg +ln -sf yolov3-tiny.weights yolov3-tiny-416.weights + +echo "Creating yolov3-288.cfg and yolov3-288.weights" +cat yolov3.cfg | sed -e '8s/width=608/width=288/' | sed -e '9s/height=608/height=288/' > yolov3-288.cfg +ln -sf yolov3.weights yolov3-288.weights +echo "Creating yolov3-416.cfg and yolov3-416.weights" +cat yolov3.cfg | sed -e '8s/width=608/width=416/' | sed -e '9s/height=608/height=416/' > yolov3-416.cfg +ln -sf yolov3.weights yolov3-416.weights +echo "Creating yolov3-608.cfg and yolov3-608.weights" +cp yolov3.cfg yolov3-608.cfg +ln -sf yolov3.weights yolov3-608.weights + +echo "Creating yolov3-spp-288.cfg and yolov3-spp-288.weights" +cat yolov3-spp.cfg | sed -e '8s/width=608/width=288/' | sed -e '9s/height=608/height=288/' > yolov3-spp-288.cfg +ln -sf yolov3-spp.weights yolov3-spp-288.weights +echo "Creating yolov3-spp-416.cfg and yolov3-spp-416.weights" +cat yolov3-spp.cfg | sed -e '8s/width=608/width=416/' | sed -e '9s/height=608/height=416/' > yolov3-spp-416.cfg +ln -sf yolov3-spp.weights yolov3-spp-416.weights +echo "Creating yolov3-spp-608.cfg and yolov3-spp-608.weights" +cp yolov3-spp.cfg yolov3-spp-608.cfg +ln -sf yolov3-spp.weights yolov3-spp-608.weights + +echo "Creating yolov4-tiny-288.cfg and yolov4-tiny-288.weights" +cat yolov4-tiny.cfg | sed -e '6s/batch=64/batch=1/' | sed -e '8s/width=416/width=288/' | sed -e '9s/height=416/height=288/' > yolov4-tiny-288.cfg +echo >> yolov4-tiny-288.cfg +ln -sf yolov4-tiny.weights yolov4-tiny-288.weights +echo "Creating yolov4-tiny-416.cfg and yolov4-tiny-416.weights" +cat yolov4-tiny.cfg | sed -e '6s/batch=64/batch=1/' > yolov4-tiny-416.cfg +echo >> yolov4-tiny-416.cfg +ln -sf yolov4-tiny.weights yolov4-tiny-416.weights + +echo "Creating yolov4-288.cfg and yolov4-288.weights" +cat yolov4.cfg | sed -e '2s/batch=64/batch=1/' | sed -e '7s/width=608/width=288/' | sed -e '8s/height=608/height=288/' > yolov4-288.cfg +ln -sf yolov4.weights yolov4-288.weights +echo "Creating yolov4-416.cfg and yolov4-416.weights" +cat yolov4.cfg | sed -e '2s/batch=64/batch=1/' | sed -e '7s/width=608/width=416/' | sed -e '8s/height=608/height=416/' > yolov4-416.cfg +ln -sf yolov4.weights yolov4-416.weights +echo "Creating yolov4-608.cfg and yolov4-608.weights" +cat yolov4.cfg | sed -e '2s/batch=64/batch=1/' > yolov4-608.cfg +ln -sf yolov4.weights yolov4-608.weights + +echo "Creating yolov4-csp-256.cfg and yolov4-csp-256.weights" +cat yolov4-csp.cfg | sed -e '6s/batch=64/batch=1/' | sed -e '8s/width=512/width=256/' | sed -e '9s/height=512/height=256/' > yolov4-csp-256.cfg +ln -sf yolov4-csp.weights yolov4-csp-256.weights +echo "Creating yolov4-csp-512.cfg and yolov4x-csp-512.weights" +cat yolov4-csp.cfg | sed -e '6s/batch=64/batch=1/' > yolov4-csp-512.cfg +ln -sf yolov4-csp.weights yolov4-csp-512.weights + +echo "Creating yolov4x-mish-320.cfg and yolov4x-mish-320.weights" +cat yolov4x-mish.cfg | sed -e '6s/batch=64/batch=1/' | sed -e '8s/width=640/width=320/' | sed -e '9s/height=640/height=320/' > yolov4x-mish-320.cfg +ln -sf yolov4x-mish.weights yolov4x-mish-320.weights +echo "Creating yolov4x-mish-640.cfg and yolov4x-mish-640.weights" +cat yolov4x-mish.cfg | sed -e '6s/batch=64/batch=1/' > yolov4x-mish-640.cfg +ln -sf yolov4x-mish.weights yolov4x-mish-640.weights + +echo "Creating yolov4-p5-448.cfg and yolov4-p5-448.weights" +cat yolov4-p5.cfg | sed -e '6s/batch=64/batch=1/' | sed -e 
'8s/width=896/width=448/' | sed -e '9s/height=896/height=448/' > yolov4-p5-448.cfg +ln -sf yolov4-p5.weights yolov4-p5-448.weights +echo "Creating yolov4-p5-896.cfg and yolov4-p5-896.weights" +cat yolov4-p5.cfg | sed -e '6s/batch=64/batch=1/' > yolov4-p5-896.cfg +ln -sf yolov4-p5.weights yolov4-p5-896.weights + +echo +echo "Done." diff --git a/yolo/install_pycuda.sh b/yolo/install_pycuda.sh new file mode 100755 index 0000000..578ad60 --- /dev/null +++ b/yolo/install_pycuda.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Reference for installing 'pycuda': https://wiki.tiker.net/PyCuda/Installation/Linux/Ubuntu + +set -e + +if ! which nvcc > /dev/null; then + echo "ERROR: nvcc not found" + exit +fi + +arch=$(uname -m) +folder=${HOME}/src +mkdir -p $folder + +echo "** Install requirements" +sudo apt-get install -y build-essential python3-dev +sudo apt-get install -y libboost-python-dev libboost-thread-dev +sudo pip3 install setuptools + +boost_pylib=$(basename /usr/lib/${arch}-linux-gnu/libboost_python*-py3?.so) +boost_pylibname=${boost_pylib%.so} +boost_pyname=${boost_pylibname/lib/} + +echo "** Download pycuda-2019.1.2 sources" +pushd $folder +if [ ! -f pycuda-2019.1.2.tar.gz ]; then + wget https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz +fi + +echo "** Build and install pycuda-2019.1.2" +CPU_CORES=$(nproc) +echo "** cpu cores available: " $CPU_CORES +tar xzvf pycuda-2019.1.2.tar.gz +cd pycuda-2019.1.2 +python3 ./configure.py --python-exe=/usr/bin/python3 --cuda-root=/usr/local/cuda --cudadrv-lib-dir=/usr/lib/${arch}-linux-gnu --boost-inc-dir=/usr/include --boost-lib-dir=/usr/lib/${arch}-linux-gnu --boost-python-libname=${boost_pyname} --boost-thread-libname=boost_thread --no-use-shipped-boost +make -j$CPU_CORES +python3 setup.py build +sudo python3 setup.py install + +popd + +python3 -c "import pycuda; print('pycuda version:', pycuda.VERSION)" diff --git a/yolo/onnx_to_tensorrt.py b/yolo/onnx_to_tensorrt.py new file mode 100644 index 0000000..01366b6 --- /dev/null +++ b/yolo/onnx_to_tensorrt.py @@ -0,0 +1,212 @@ +# onnx_to_tensorrt.py +# +# Copyright 1993-2019 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO LICENSEE: +# +# This source code and/or documentation ("Licensed Deliverables") are +# subject to NVIDIA intellectual property rights under U.S. and +# international Copyright laws. +# +# These Licensed Deliverables contained herein is PROPRIETARY and +# CONFIDENTIAL to NVIDIA and is being provided under the terms and +# conditions of a form of NVIDIA software license agreement by and +# between NVIDIA and Licensee ("License Agreement") or electronically +# accepted by Licensee. Notwithstanding any terms or conditions to +# the contrary in the License Agreement, reproduction or disclosure +# of the Licensed Deliverables to any third party without the express +# written consent of NVIDIA is prohibited. +# +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THESE LICENSED DELIVERABLES. +# +# U.S. Government End Users. These Licensed Deliverables are a +# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +# 1995), consisting of "commercial computer software" and "commercial +# computer software documentation" as such terms are used in 48 +# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +# only as a commercial end item. Consistent with 48 C.F.R.12.212 and +# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +# U.S. Government End Users acquire the Licensed Deliverables with +# only those rights set forth herein. +# +# Any use of the Licensed Deliverables in individual and commercial +# software must include, in the user documentation and internal +# comments to the code, the above Disclaimer and U.S. Government End +# Users Notice. +# + + +from __future__ import print_function + +import os +import argparse + +import tensorrt as trt + +from yolo_to_onnx import DarkNetParser, get_h_and_w +from plugins import add_yolo_plugins, add_concat + + +MAX_BATCH_SIZE = 1 + + +def get_c(layer_configs): + """Find input channels of the yolo model from layer configs.""" + net_config = layer_configs['000_net'] + return net_config.get('channels', 3) + + +def load_onnx(model_name): + """Read the ONNX file.""" + onnx_path = '%s.onnx' % model_name + if not os.path.isfile(onnx_path): + print('ERROR: file (%s) not found! You might want to run yolo_to_onnx.py first to generate it.' % onnx_path) + return None + else: + with open(onnx_path, 'rb') as f: + return f.read() + + +def set_net_batch(network, batch_size): + """Set network input batch size. + + The ONNX file might have been generated with a different batch size, + say, 64. 
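+
+    The first (batch) dimension of the input shape is simply overwritten
+    with the requested batch_size; build_engine() below calls this with
+    MAX_BATCH_SIZE (1), so the engine is built for batch size 1.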
+ """ + if trt.__version__[0] >= '7': + shape = list(network.get_input(0).shape) + shape[0] = batch_size + network.get_input(0).shape = shape + return network + + +def build_engine(model_name, do_int8, dla_core, verbose=False): + """Build a TensorRT engine from ONNX using the older API.""" + cfg_file_path = model_name + '.cfg' + parser = DarkNetParser() + layer_configs = parser.parse_cfg_file(cfg_file_path) + net_c = get_c(layer_configs) + net_h, net_w = get_h_and_w(layer_configs) + + print('Loading the ONNX file...') + onnx_data = load_onnx(model_name) + if onnx_data is None: + return None + + TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger() + EXPLICIT_BATCH = [] if trt.__version__[0] < '7' else \ + [1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)] + with trt.Builder(TRT_LOGGER) as builder, builder.create_network(*EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser: + if do_int8 and not builder.platform_has_fast_int8: + raise RuntimeError('INT8 not supported on this platform') + if not parser.parse(onnx_data): + print('ERROR: Failed to parse the ONNX file.') + for error in range(parser.num_errors): + print(parser.get_error(error)) + return None + network = set_net_batch(network, MAX_BATCH_SIZE) + + print('Adding yolo_layer plugins.') + network = add_yolo_plugins(network, model_name, TRT_LOGGER) + + print('Adding a concatenated output as "detections".') + network = add_concat(network, model_name, TRT_LOGGER) + + print('Naming the input tensort as "input".') + network.get_input(0).name = 'input' + + print('Building the TensorRT engine. This would take a while...') + print('(Use "--verbose" or "-v" to enable verbose logging.)') + if trt.__version__[0] < '7': # older API: build_cuda_engine() + if dla_core >= 0: + raise RuntimeError('DLA core not supported by old API') + builder.max_batch_size = MAX_BATCH_SIZE + builder.max_workspace_size = 1 << 30 + builder.fp16_mode = True # alternative: builder.platform_has_fast_fp16 + if do_int8: + from calibrator import YOLOEntropyCalibrator + builder.int8_mode = True + builder.int8_calibrator = YOLOEntropyCalibrator( + 'calib_images', (net_h, net_w), 'calib_%s.bin' % model_name) + engine = builder.build_cuda_engine(network) + else: # new API: build_engine() with builder config + builder.max_batch_size = MAX_BATCH_SIZE + config = builder.create_builder_config() + config.max_workspace_size = 1 << 30 + config.set_flag(trt.BuilderFlag.GPU_FALLBACK) + config.set_flag(trt.BuilderFlag.FP16) + profile = builder.create_optimization_profile() + profile.set_shape( + 'input', # input tensor name + (MAX_BATCH_SIZE, net_c, net_h, net_w), # min shape + (MAX_BATCH_SIZE, net_c, net_h, net_w), # opt shape + (MAX_BATCH_SIZE, net_c, net_h, net_w)) # max shape + config.add_optimization_profile(profile) + if do_int8: + from calibrator import YOLOEntropyCalibrator + config.set_flag(trt.BuilderFlag.INT8) + config.int8_calibrator = YOLOEntropyCalibrator( + 'calib_images', (net_h, net_w), + 'calib_%s.bin' % model_name) + config.set_calibration_profile(profile) + if dla_core >= 0: + config.default_device_type = trt.DeviceType.DLA + config.DLA_core = dla_core + config.set_flag(trt.BuilderFlag.STRICT_TYPES) + print('Using DLA core %d.' 
% dla_core) + engine = builder.build_engine(network, config) + + if engine is not None: + print('Completed creating engine.') + return engine + + +def main(): + """Create a TensorRT engine for ONNX-based YOLO.""" + parser = argparse.ArgumentParser() + parser.add_argument( + '-v', '--verbose', action='store_true', + help='enable verbose output (for debugging)') + parser.add_argument( + '-c', '--category_num', type=int, + help='number of object categories (obsolete)') + parser.add_argument( + '-m', '--model', type=str, required=True, + help=('[yolov3-tiny|yolov3|yolov3-spp|yolov4-tiny|yolov4|' + 'yolov4-csp|yolov4x-mish|yolov4-p5]-[{dimension}], where ' + '{dimension} could be either a single number (e.g. ' + '288, 416, 608) or 2 numbers, WxH (e.g. 416x256)')) + parser.add_argument( + '--int8', action='store_true', + help='build INT8 TensorRT engine') + parser.add_argument( + '--dla_core', type=int, default=-1, + help='id of DLA core for inference (0 ~ N-1)') + args = parser.parse_args() + + engine = build_engine( + args.model, args.int8, args.dla_core, args.verbose) + if engine is None: + raise SystemExit('ERROR: failed to build the TensorRT engine!') + + engine_path = '%s.trt' % args.model + with open(engine_path, 'wb') as f: + f.write(engine.serialize()) + print('Serialized the TensorRT engine to file: %s' % engine_path) + + +if __name__ == '__main__': + main() diff --git a/yolo/plugins.py b/yolo/plugins.py new file mode 100644 index 0000000..b48d8ff --- /dev/null +++ b/yolo/plugins.py @@ -0,0 +1,137 @@ +"""plugins.py + +I referenced the code from https://github.com/dongfangduoshou123/YoloV3-TensorRT/blob/master/seralizeEngineFromPythonAPI.py +""" + + +import ctypes + +import numpy as np +import tensorrt as trt + +from yolo_to_onnx import (is_pan_arch, DarkNetParser, get_category_num, + get_h_and_w, get_output_convs, get_anchors) + + +try: + ctypes.cdll.LoadLibrary('../plugins/libyolo_layer.so') +except OSError as e: + raise SystemExit('ERROR: failed to load ../plugins/libyolo_layer.so. 
' + 'Did you forget to do a "make" in the "../plugins/" ' + 'subdirectory?') from e + + +def get_scales(cfg_file_path): + """Get scale_x_y's of all yolo layers from the cfg file.""" + with open(cfg_file_path, 'r') as f: + cfg_lines = f.readlines() + yolo_lines = [l.strip() for l in cfg_lines if l.startswith('[yolo]')] + scale_lines = [l.strip() for l in cfg_lines if l.startswith('scale_x_y')] + if len(scale_lines) == 0: + return [1.0] * len(yolo_lines) + else: + assert len(scale_lines) == len(yolo_lines) + return [float(l.split('=')[-1]) for l in scale_lines] + + +def get_new_coords(cfg_file_path): + """Get new_coords flag of yolo layers from the cfg file.""" + with open(cfg_file_path, 'r') as f: + cfg_lines = f.readlines() + yolo_lines = [l.strip() for l in cfg_lines if l.startswith('[yolo]')] + newc_lines = [l.strip() for l in cfg_lines if l.startswith('new_coords')] + if len(newc_lines) == 0: + return 0 + else: + assert len(newc_lines) == len(yolo_lines) + return int(newc_lines[-1].split('=')[-1]) + + +def get_plugin_creator(plugin_name, logger): + """Get the TensorRT plugin creator.""" + trt.init_libnvinfer_plugins(logger, '') + plugin_creator_list = trt.get_plugin_registry().plugin_creator_list + for c in plugin_creator_list: + if c.name == plugin_name: + return c + return None + + +def add_yolo_plugins(network, model_name, logger): + """Add yolo plugins into a TensorRT network.""" + cfg_file_path = model_name + '.cfg' + parser = DarkNetParser() + layer_configs = parser.parse_cfg_file(cfg_file_path) + num_classes = get_category_num(cfg_file_path) + output_tensor_names = get_output_convs(layer_configs) + h, w = get_h_and_w(layer_configs) + if len(output_tensor_names) == 2: + yolo_whs = [ + [w // 32, h // 32], [w // 16, h // 16]] + elif len(output_tensor_names) == 3: + yolo_whs = [ + [w // 32, h // 32], [w // 16, h // 16], + [w // 8, h // 8]] + elif len(output_tensor_names) == 4: + yolo_whs = [ + [w // 64, h // 64], [w // 32, h // 32], + [w // 16, h // 16], [w // 8, h // 8]] + else: + raise TypeError('bad number of outputs: %d' % len(output_tensor_names)) + if is_pan_arch(cfg_file_path): + yolo_whs.reverse() + anchors = get_anchors(cfg_file_path) + if len(anchors) != len(yolo_whs): + raise ValueError('bad number of yolo layers: %d vs. %d' % + (len(anchors), len(yolo_whs))) + if network.num_outputs != len(anchors): + raise ValueError('bad number of network outputs: %d vs. %d' % + (network.num_outputs, len(anchors))) + scales = get_scales(cfg_file_path) + if any([s < 1.0 for s in scales]): + raise ValueError('bad scale_x_y: %s' % str(scales)) + if len(scales) != len(anchors): + raise ValueError('bad number of scales: %d vs. 
%d' % + (len(scales), len(anchors))) + new_coords = get_new_coords(cfg_file_path) + + plugin_creator = get_plugin_creator('YoloLayer_TRT', logger) + if not plugin_creator: + raise RuntimeError('cannot get YoloLayer_TRT plugin creator') + old_tensors = [network.get_output(i) for i in range(network.num_outputs)] + new_tensors = [None] * network.num_outputs + for i, old_tensor in enumerate(old_tensors): + input_multiplier = w // yolo_whs[i][0] + new_tensors[i] = network.add_plugin_v2( + [old_tensor], + plugin_creator.create_plugin('YoloLayer_TRT', trt.PluginFieldCollection([ + trt.PluginField("yoloWidth", np.array(yolo_whs[i][0], dtype=np.int32), trt.PluginFieldType.INT32), + trt.PluginField("yoloHeight", np.array(yolo_whs[i][1], dtype=np.int32), trt.PluginFieldType.INT32), + trt.PluginField("inputMultiplier", np.array(input_multiplier, dtype=np.int32), trt.PluginFieldType.INT32), + trt.PluginField("newCoords", np.array(new_coords, dtype=np.int32), trt.PluginFieldType.INT32), + trt.PluginField("numClasses", np.array(num_classes, dtype=np.int32), trt.PluginFieldType.INT32), + trt.PluginField("numAnchors", np.array(len(anchors[i]) // 2, dtype=np.int32), trt.PluginFieldType.INT32), + trt.PluginField("anchors", np.ascontiguousarray(anchors[i], dtype=np.float32), trt.PluginFieldType.FLOAT32), + trt.PluginField("scaleXY", np.array(scales[i], dtype=np.float32), trt.PluginFieldType.FLOAT32), + ])) + ).get_output(0) + + for new_tensor in new_tensors: + network.mark_output(new_tensor) + for old_tensor in old_tensors: + network.unmark_output(old_tensor) + + return network + + +def add_concat(network, model_name, logger): + """Add a final concatenation output into a TensorRT network.""" + if network.num_outputs < 2 or network.num_outputs > 4: + raise TypeError('bad number of yolo layers: %d' % network.num_outputs) + yolo_tensors = [network.get_output(i) for i in range(network.num_outputs)] + concat_tensor = network.add_concatenation(yolo_tensors).get_output(0) + for yolo_tensor in yolo_tensors: + network.unmark_output(yolo_tensor) + concat_tensor.name = 'detections' + network.mark_output(concat_tensor) + return network diff --git a/yolo/requirements.txt b/yolo/requirements.txt new file mode 100644 index 0000000..1b41021 --- /dev/null +++ b/yolo/requirements.txt @@ -0,0 +1,3 @@ +numpy +onnx==1.9.0 +pycuda diff --git a/yolo/yolo_to_onnx.py b/yolo/yolo_to_onnx.py new file mode 100644 index 0000000..fc1080d --- /dev/null +++ b/yolo/yolo_to_onnx.py @@ -0,0 +1,1067 @@ +# yolo_to_onnx.py +# +# Copyright 1993-2019 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO LICENSEE: +# +# This source code and/or documentation ("Licensed Deliverables") are +# subject to NVIDIA intellectual property rights under U.S. and +# international Copyright laws. +# +# These Licensed Deliverables contained herein is PROPRIETARY and +# CONFIDENTIAL to NVIDIA and is being provided under the terms and +# conditions of a form of NVIDIA software license agreement by and +# between NVIDIA and Licensee ("License Agreement") or electronically +# accepted by Licensee. Notwithstanding any terms or conditions to +# the contrary in the License Agreement, reproduction or disclosure +# of the Licensed Deliverables to any third party without the express +# written consent of NVIDIA is prohibited. +# +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS +# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THESE LICENSED DELIVERABLES. +# +# U.S. Government End Users. These Licensed Deliverables are a +# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +# 1995), consisting of "commercial computer software" and "commercial +# computer software documentation" as such terms are used in 48 +# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +# only as a commercial end item. Consistent with 48 C.F.R.12.212 and +# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +# U.S. Government End Users acquire the Licensed Deliverables with +# only those rights set forth herein. +# +# Any use of the Licensed Deliverables in individual and commercial +# software must include, in the user documentation and internal +# comments to the code, the above Disclaimer and U.S. Government End +# Users Notice. +# + + +import os +import sys +import argparse +from collections import OrderedDict + +import numpy as np +import onnx +from onnx import helper, TensorProto + + +MAX_BATCH_SIZE = 1 + + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + '-c', '--category_num', type=int, + help='number of object categories (obsolete)') + parser.add_argument( + '-m', '--model', type=str, required=True, + help=('[yolov3-tiny|yolov3|yolov3-spp|yolov4-tiny|yolov4|' + 'yolov4-csp|yolov4x-mish|yolov4-p5]-[{dimension}], where ' + '{dimension} could be either a single number (e.g. ' + '288, 416, 608) or 2 numbers, WxH (e.g. 416x256)')) + args = parser.parse_args() + return args + + +def rreplace(s, old, new, occurrence=1): + """Replace old pattern in the string with new from the right.""" + return new.join(s.rsplit(old, occurrence)) + + +def is_pan_arch(cfg_file_path): + """Determine whether the yolo model is with PAN architecture.""" + with open(cfg_file_path, 'r') as f: + cfg_lines = [l.strip() for l in f.readlines()] + yolos_or_upsamples = [l for l in cfg_lines + if l in ['[yolo]', '[upsample]']] + yolo_count = len([l for l in yolos_or_upsamples if l == '[yolo]']) + upsample_count = len(yolos_or_upsamples) - yolo_count + assert yolo_count in (2, 3, 4) # at most 4 yolo layers + assert upsample_count == yolo_count - 1 or upsample_count == 0 + # the model is with PAN if an upsample layer appears before the 1st yolo + return yolos_or_upsamples[0] == '[upsample]' + + +def get_output_convs(layer_configs): + """Find output conv layer names from layer configs. + + The output conv layers are those conv layers immediately proceeding + the yolo layers. + + # Arguments + layer_configs: output of the DarkNetParser, i.e. a OrderedDict of + the yolo layers. 
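+
+    Note: an output conv with 'logistic' activation is recorded with an
+    '_lgx' suffix, matching the name of the Sigmoid node that gets appended
+    to it in the ONNX graph.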
+ """ + output_convs = [] + previous_layer = None + for current_layer in layer_configs.keys(): + if previous_layer is not None and current_layer.endswith('yolo'): + assert previous_layer.endswith('convolutional') + activation = layer_configs[previous_layer]['activation'] + if activation == 'linear': + output_convs.append(previous_layer) + elif activation == 'logistic': + output_convs.append(previous_layer + '_lgx') + else: + raise TypeError('unexpected activation: %s' % activation) + previous_layer = current_layer + return output_convs + + +def get_category_num(cfg_file_path): + """Find number of output classes of the yolo model.""" + with open(cfg_file_path, 'r') as f: + cfg_lines = [l.strip() for l in f.readlines()] + classes_lines = [l for l in cfg_lines if l.startswith('classes=')] + assert len(set(classes_lines)) == 1 + return int(classes_lines[-1].split('=')[-1].strip()) + + +def get_h_and_w(layer_configs): + """Find input height and width of the yolo model from layer configs.""" + net_config = layer_configs['000_net'] + return net_config['height'], net_config['width'] + + +def get_anchors(cfg_file_path): + """Get anchors of all yolo layers from the cfg file.""" + with open(cfg_file_path, 'r') as f: + cfg_lines = f.readlines() + yolo_lines = [l.strip() for l in cfg_lines if l.startswith('[yolo]')] + mask_lines = [l.strip() for l in cfg_lines if l.startswith('mask')] + anch_lines = [l.strip() for l in cfg_lines if l.startswith('anchors')] + assert len(mask_lines) == len(yolo_lines) + assert len(anch_lines) == len(yolo_lines) + anchor_list = eval('[%s]' % anch_lines[0].split('=')[-1]) + mask_strs = [l.split('=')[-1] for l in mask_lines] + masks = [eval('[%s]' % s) for s in mask_strs] + anchors = [] + for mask in masks: + curr_anchors = [] + for m in mask: + curr_anchors.append(anchor_list[m * 2]) + curr_anchors.append(anchor_list[m * 2 + 1]) + anchors.append(curr_anchors) + return anchors + + +def get_anchor_num(cfg_file_path): + """Find number of anchors (masks) of the yolo model.""" + anchors = get_anchors(cfg_file_path) + num_anchors = [len(a) // 2 for a in anchors] + + assert len(num_anchors) > 0, 'Found no `mask` fields in config' + assert len(set(num_anchors)) == 1, 'Found different num anchors' + + return num_anchors[0] + + +class DarkNetParser(object): + """Definition of a parser for DarkNet-based YOLO model.""" + + def __init__(self, supported_layers=None): + """Initializes a DarkNetParser object. + + Keyword argument: + supported_layers -- a string list of supported layers in DarkNet naming convention, + parameters are only added to the class dictionary if a parsed layer is included. + """ + + # A list of YOLO layers containing dictionaries with all layer + # parameters: + self.layer_configs = OrderedDict() + self.supported_layers = supported_layers if supported_layers else \ + ['net', 'convolutional', 'maxpool', 'shortcut', + 'route', 'upsample', 'yolo'] + self.layer_counter = 0 + + def parse_cfg_file(self, cfg_file_path): + """Takes the yolov?.cfg file and parses it layer by layer, + appending each layer's parameters as a dictionary to layer_configs. + + Keyword argument: + cfg_file_path + """ + with open(cfg_file_path, 'r') as cfg_file: + remainder = cfg_file.read() + while remainder is not None: + layer_dict, layer_name, remainder = self._next_layer(remainder) + if layer_dict is not None: + self.layer_configs[layer_name] = layer_dict + return self.layer_configs + + def _next_layer(self, remainder): + """Takes in a string and segments it by looking for DarkNet delimiters. 
+ Returns the layer parameters and the remaining string after the last delimiter. + Example for the first Conv layer in yolo.cfg ... + + [convolutional] + batch_normalize=1 + filters=32 + size=3 + stride=1 + pad=1 + activation=leaky + + ... becomes the following layer_dict return value: + {'activation': 'leaky', 'stride': 1, 'pad': 1, 'filters': 32, + 'batch_normalize': 1, 'type': 'convolutional', 'size': 3}. + + '001_convolutional' is returned as layer_name, and all lines that follow in yolo.cfg + are returned as the next remainder. + + Keyword argument: + remainder -- a string with all raw text after the previously parsed layer + """ + remainder = remainder.split('[', 1) + while len(remainder[0]) > 0 and remainder[0][-1] == '#': + # '#[...' case (the left bracket is proceeded by a pound sign), + # assuming this layer is commented out, so go find the next '[' + remainder = remainder[1].split('[', 1) + if len(remainder) == 2: + remainder = remainder[1] + else: + # no left bracket found in remainder + return None, None, None + remainder = remainder.split(']', 1) + if len(remainder) == 2: + layer_type, remainder = remainder + else: + # no right bracket + raise ValueError('no closing bracket!') + if layer_type not in self.supported_layers: + raise ValueError('%s layer not supported!' % layer_type) + + out = remainder.split('\n[', 1) + if len(out) == 2: + layer_param_block, remainder = out[0], '[' + out[1] + else: + layer_param_block, remainder = out[0], '' + layer_param_lines = layer_param_block.split('\n') + # remove empty lines + layer_param_lines = [l.lstrip() for l in layer_param_lines if l.lstrip()] + # don't parse yolo layers + if layer_type == 'yolo': layer_param_lines = [] + skip_params = ['steps', 'scales'] if layer_type == 'net' else [] + layer_name = str(self.layer_counter).zfill(3) + '_' + layer_type + layer_dict = dict(type=layer_type) + for param_line in layer_param_lines: + param_line = param_line.split('#')[0] + if not param_line: continue + assert '[' not in param_line + param_type, param_value = self._parse_params(param_line, skip_params) + layer_dict[param_type] = param_value + self.layer_counter += 1 + return layer_dict, layer_name, remainder + + def _parse_params(self, param_line, skip_params=None): + """Identifies the parameters contained in one of the cfg file and returns + them in the required format for each parameter type, e.g. as a list, an int or a float. + + Keyword argument: + param_line -- one parsed line within a layer block + """ + param_line = param_line.replace(' ', '') + param_type, param_value_raw = param_line.split('=') + assert param_value_raw + param_value = None + if skip_params and param_type in skip_params: + param_type = None + elif param_type == 'layers': + layer_indexes = list() + for index in param_value_raw.split(','): + layer_indexes.append(int(index)) + param_value = layer_indexes + elif isinstance(param_value_raw, str) and not param_value_raw.isalpha(): + condition_param_value_positive = param_value_raw.isdigit() + condition_param_value_negative = param_value_raw[0] == '-' and \ + param_value_raw[1:].isdigit() + if condition_param_value_positive or condition_param_value_negative: + param_value = int(param_value_raw) + else: + param_value = float(param_value_raw) + else: + param_value = str(param_value_raw) + return param_type, param_value + + +class MajorNodeSpecs(object): + """Helper class used to store the names of ONNX output names, + corresponding to the output of a DarkNet layer and its output channels. 
+ Some DarkNet layers are not created and there is no corresponding ONNX node, + but we still need to track them in order to set up skip connections. + """ + + def __init__(self, name, channels): + """ Initialize a MajorNodeSpecs object. + + Keyword arguments: + name -- name of the ONNX node + channels -- number of output channels of this node + """ + self.name = name + self.channels = channels + self.created_onnx_node = False + if name is not None and isinstance(channels, int) and channels > 0: + self.created_onnx_node = True + + +class ConvParams(object): + """Helper class to store the hyper parameters of a Conv layer, + including its prefix name in the ONNX graph and the expected dimensions + of weights for convolution, bias, and batch normalization. + + Additionally acts as a wrapper for generating safe names for all + weights, checking on feasible combinations. + """ + + def __init__(self, node_name, batch_normalize, conv_weight_dims): + """Constructor based on the base node name (e.g. 101_convolutional), the batch + normalization setting, and the convolutional weights shape. + + Keyword arguments: + node_name -- base name of this YOLO convolutional layer + batch_normalize -- bool value if batch normalization is used + conv_weight_dims -- the dimensions of this layer's convolutional weights + """ + self.node_name = node_name + self.batch_normalize = batch_normalize + assert len(conv_weight_dims) == 4 + self.conv_weight_dims = conv_weight_dims + + def generate_param_name(self, param_category, suffix): + """Generates a name based on two string inputs, + and checks if the combination is valid.""" + assert suffix + assert param_category in ['bn', 'conv'] + assert(suffix in ['scale', 'mean', 'var', 'weights', 'bias']) + if param_category == 'bn': + assert self.batch_normalize + assert suffix in ['scale', 'bias', 'mean', 'var'] + elif param_category == 'conv': + assert suffix in ['weights', 'bias'] + if suffix == 'bias': + assert not self.batch_normalize + param_name = self.node_name + '_' + param_category + '_' + suffix + return param_name + +class ResizeParams(object): + #Helper class to store the scale parameter for an Resize node. + + def __init__(self, node_name, value): + """Constructor based on the base node name (e.g. 86_Resize), + and the value of the scale input tensor. + + Keyword arguments: + node_name -- base name of this YOLO Resize layer + value -- the value of the scale input to the Resize layer as numpy array + """ + self.node_name = node_name + self.value = value + + def generate_param_name(self): + """Generates the scale parameter name for the Resize node.""" + param_name = self.node_name + '_' + "scale" + return param_name + + def generate_roi_name(self): + """Generates the roi input name for the Resize node.""" + param_name = self.node_name + '_' + "roi" + return param_name + +class WeightLoader(object): + """Helper class used for loading the serialized weights of a binary file stream + and returning the initializers and the input tensors required for populating + the ONNX graph with weights. + """ + + def __init__(self, weights_file_path): + """Initialized with a path to the YOLO .weights file. + + Keyword argument: + weights_file_path -- path to the weights file. + """ + self.weights_file = self._open_weights_file(weights_file_path) + + def load_resize_scales(self, resize_params): + """Returns the initializers with the value of the scale input + tensor given by resize_params. 
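+
+        Besides the scales tensor, a dummy 4-element 'roi' tensor is also
+        emitted, since the opset 11 Resize node expects 'roi' as its second
+        input.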
+ + Keyword argument: + resize_params -- a ResizeParams object + """ + initializer = list() + inputs = list() + name = resize_params.generate_param_name() + shape = resize_params.value.shape + data = resize_params.value + scale_init = helper.make_tensor( + name, TensorProto.FLOAT, shape, data) + scale_input = helper.make_tensor_value_info( + name, TensorProto.FLOAT, shape) + initializer.append(scale_init) + inputs.append(scale_input) + + # In opset 11 an additional input named roi is required. Create a dummy tensor to satisfy this. + # It is a 1D tensor of size of the rank of the input (4) + rank = 4 + roi_name = resize_params.generate_roi_name() + roi_input = helper.make_tensor_value_info(roi_name, TensorProto.FLOAT, [rank]) + roi_init = helper.make_tensor(roi_name, TensorProto.FLOAT, [rank], [0,0,0,0]) + initializer.append(roi_init) + inputs.append(roi_input) + + return initializer, inputs + + def load_conv_weights(self, conv_params): + """Returns the initializers with weights from the weights file and + the input tensors of a convolutional layer for all corresponding ONNX nodes. + + Keyword argument: + conv_params -- a ConvParams object + """ + initializer = list() + inputs = list() + if conv_params.batch_normalize: + bias_init, bias_input = self._create_param_tensors( + conv_params, 'bn', 'bias') + bn_scale_init, bn_scale_input = self._create_param_tensors( + conv_params, 'bn', 'scale') + bn_mean_init, bn_mean_input = self._create_param_tensors( + conv_params, 'bn', 'mean') + bn_var_init, bn_var_input = self._create_param_tensors( + conv_params, 'bn', 'var') + initializer.extend( + [bn_scale_init, bias_init, bn_mean_init, bn_var_init]) + inputs.extend([bn_scale_input, bias_input, + bn_mean_input, bn_var_input]) + else: + bias_init, bias_input = self._create_param_tensors( + conv_params, 'conv', 'bias') + initializer.append(bias_init) + inputs.append(bias_input) + conv_init, conv_input = self._create_param_tensors( + conv_params, 'conv', 'weights') + initializer.append(conv_init) + inputs.append(conv_input) + return initializer, inputs + + def _open_weights_file(self, weights_file_path): + """Opens a YOLO DarkNet file stream and skips the header. + + Keyword argument: + weights_file_path -- path to the weights file. + """ + weights_file = open(weights_file_path, 'rb') + length_header = 5 + np.ndarray(shape=(length_header, ), dtype='int32', + buffer=weights_file.read(length_header * 4)) + return weights_file + + def _create_param_tensors(self, conv_params, param_category, suffix): + """Creates the initializers with weights from the weights file together with + the input tensors. + + Keyword arguments: + conv_params -- a ConvParams object + param_category -- the category of parameters to be created ('bn' or 'conv') + suffix -- a string determining the sub-type of above param_category (e.g., + 'weights' or 'bias') + """ + param_name, param_data, param_data_shape = self._load_one_param_type( + conv_params, param_category, suffix) + + initializer_tensor = helper.make_tensor( + param_name, TensorProto.FLOAT, param_data_shape, param_data) + input_tensor = helper.make_tensor_value_info( + param_name, TensorProto.FLOAT, param_data_shape) + return initializer_tensor, input_tensor + + def _load_one_param_type(self, conv_params, param_category, suffix): + """Deserializes the weights from a file stream in the DarkNet order. 
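+
+        DarkNet stores a batch-normalized conv layer as: biases (beta),
+        scales (gamma), running means, running variances, then the conv
+        weights; a layer without batch norm stores biases then weights.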
+ + Keyword arguments: + conv_params -- a ConvParams object + param_category -- the category of parameters to be created ('bn' or 'conv') + suffix -- a string determining the sub-type of above param_category (e.g., + 'weights' or 'bias') + """ + param_name = conv_params.generate_param_name(param_category, suffix) + channels_out, channels_in, filter_h, filter_w = conv_params.conv_weight_dims + if param_category == 'bn': + param_shape = [channels_out] + elif param_category == 'conv': + if suffix == 'weights': + param_shape = [channels_out, channels_in, filter_h, filter_w] + elif suffix == 'bias': + param_shape = [channels_out] + param_size = np.product(np.array(param_shape)) + param_data = np.ndarray( + shape=param_shape, + dtype='float32', + buffer=self.weights_file.read(param_size * 4)) + param_data = param_data.flatten().astype(float) + return param_name, param_data, param_shape + + +class GraphBuilderONNX(object): + """Class for creating an ONNX graph from a previously generated list of layer dictionaries.""" + + def __init__(self, model_name, output_tensors, batch_size): + """Initialize with all DarkNet default parameters used creating + YOLO, and specify the output tensors as an OrderedDict for their + output dimensions with their names as keys. + + Keyword argument: + output_tensors -- the output tensors as an OrderedDict containing the keys' + output dimensions + """ + self.model_name = model_name + self.output_tensors = output_tensors + self._nodes = list() + self.graph_def = None + self.input_tensor = None + self.epsilon_bn = 1e-5 + self.momentum_bn = 0.99 + self.alpha_lrelu = 0.1 + self.param_dict = OrderedDict() + self.major_node_specs = list() + self.batch_size = batch_size + self.route_spec = 0 # keeping track of the current active 'route' + + def build_onnx_graph( + self, + layer_configs, + weights_file_path, + verbose=True): + """Iterate over all layer configs (parsed from the DarkNet + representation of YOLO), create an ONNX graph, populate it with + weights from the weights file and return the graph definition. + + Keyword arguments: + layer_configs -- an OrderedDict object with all parsed layers' configurations + weights_file_path -- location of the weights file + verbose -- toggles if the graph is printed after creation (default: True) + """ + for layer_name in layer_configs.keys(): + layer_dict = layer_configs[layer_name] + major_node_specs = self._make_onnx_node(layer_name, layer_dict) + if major_node_specs.name is not None: + self.major_node_specs.append(major_node_specs) + # remove dummy 'route' and 'yolo' nodes + self.major_node_specs = [node for node in self.major_node_specs + if 'dummy' not in node.name] + outputs = list() + for tensor_name in self.output_tensors.keys(): + output_dims = [self.batch_size, ] + \ + self.output_tensors[tensor_name] + output_tensor = helper.make_tensor_value_info( + tensor_name, TensorProto.FLOAT, output_dims) + outputs.append(output_tensor) + inputs = [self.input_tensor] + weight_loader = WeightLoader(weights_file_path) + initializer = list() + # If a layer has parameters, add them to the initializer and input lists. 
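+        # (self.param_dict maps layer names to ConvParams / ResizeParams
+        # objects; the corresponding values are read sequentially from the
+        # .weights file here, in the order the layers were parsed.)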
+ for layer_name in self.param_dict.keys(): + _, layer_type = layer_name.split('_', 1) + params = self.param_dict[layer_name] + if layer_type == 'convolutional': + initializer_layer, inputs_layer = weight_loader.load_conv_weights( + params) + initializer.extend(initializer_layer) + inputs.extend(inputs_layer) + elif layer_type == 'upsample': + initializer_layer, inputs_layer = weight_loader.load_resize_scales( + params) + initializer.extend(initializer_layer) + inputs.extend(inputs_layer) + del weight_loader + self.graph_def = helper.make_graph( + nodes=self._nodes, + name=self.model_name, + inputs=inputs, + outputs=outputs, + initializer=initializer + ) + if verbose: + print(helper.printable_graph(self.graph_def)) + model_def = helper.make_model(self.graph_def, + producer_name='NVIDIA TensorRT sample') + return model_def + + def _make_onnx_node(self, layer_name, layer_dict): + """Take in a layer parameter dictionary, choose the correct function for + creating an ONNX node and store the information important to graph creation + as a MajorNodeSpec object. + + Keyword arguments: + layer_name -- the layer's name (also the corresponding key in layer_configs) + layer_dict -- a layer parameter dictionary (one element of layer_configs) + """ + layer_type = layer_dict['type'] + if self.input_tensor is None: + if layer_type == 'net': + major_node_output_name, major_node_output_channels = self._make_input_tensor( + layer_name, layer_dict) + major_node_specs = MajorNodeSpecs(major_node_output_name, + major_node_output_channels) + else: + raise ValueError('The first node has to be of type "net".') + else: + node_creators = dict() + node_creators['convolutional'] = self._make_conv_node + node_creators['maxpool'] = self._make_maxpool_node + node_creators['shortcut'] = self._make_shortcut_node + node_creators['route'] = self._make_route_node + node_creators['upsample'] = self._make_resize_node + node_creators['yolo'] = self._make_yolo_node + + if layer_type in node_creators.keys(): + major_node_output_name, major_node_output_channels = \ + node_creators[layer_type](layer_name, layer_dict) + major_node_specs = MajorNodeSpecs(major_node_output_name, + major_node_output_channels) + else: + raise TypeError('layer of type %s not supported' % layer_type) + return major_node_specs + + def _make_input_tensor(self, layer_name, layer_dict): + """Create an ONNX input tensor from a 'net' layer and store the batch size. + + Keyword arguments: + layer_name -- the layer's name (also the corresponding key in layer_configs) + layer_dict -- a layer parameter dictionary (one element of layer_configs) + """ + #batch_size = layer_dict['batch'] + channels = layer_dict['channels'] + height = layer_dict['height'] + width = layer_dict['width'] + #self.batch_size = batch_size + input_tensor = helper.make_tensor_value_info( + str(layer_name), TensorProto.FLOAT, [ + self.batch_size, channels, height, width]) + self.input_tensor = input_tensor + return layer_name, channels + + def _get_previous_node_specs(self, target_index=0): + """Get a previously ONNX node. + + Target index can be passed for jumping to a specific index. 
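+
+        A pending single-index 'route' is also honored here: when
+        self.route_spec is non-zero, that recorded node is returned instead
+        of the most recent one, and the marker is reset to 0.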
+ + Keyword arguments: + target_index -- optional for jumping to a specific index, + default: 0 for the previous element, while + taking 'route' spec into account + """ + if target_index == 0: + if self.route_spec != 0: + previous_node = self.major_node_specs[self.route_spec] + assert 'dummy' not in previous_node.name + self.route_spec = 0 + else: + previous_node = self.major_node_specs[-1] + else: + previous_node = self.major_node_specs[target_index] + assert previous_node.created_onnx_node + return previous_node + + def _make_conv_node(self, layer_name, layer_dict): + """Create an ONNX Conv node with optional batch normalization and + activation nodes. + + Keyword arguments: + layer_name -- the layer's name (also the corresponding key in layer_configs) + layer_dict -- a layer parameter dictionary (one element of layer_configs) + """ + previous_node_specs = self._get_previous_node_specs() + inputs = [previous_node_specs.name] + previous_channels = previous_node_specs.channels + kernel_size = layer_dict['size'] + stride = layer_dict['stride'] + filters = layer_dict['filters'] + batch_normalize = False + if layer_dict.get('batch_normalize', 0) > 0: + batch_normalize = True + + kernel_shape = [kernel_size, kernel_size] + weights_shape = [filters, previous_channels] + kernel_shape + conv_params = ConvParams(layer_name, batch_normalize, weights_shape) + + strides = [stride, stride] + dilations = [1, 1] + weights_name = conv_params.generate_param_name('conv', 'weights') + inputs.append(weights_name) + if not batch_normalize: + bias_name = conv_params.generate_param_name('conv', 'bias') + inputs.append(bias_name) + + conv_node = helper.make_node( + 'Conv', + inputs=inputs, + outputs=[layer_name], + kernel_shape=kernel_shape, + strides=strides, + auto_pad='SAME_LOWER', + dilations=dilations, + name=layer_name + ) + self._nodes.append(conv_node) + inputs = [layer_name] + layer_name_output = layer_name + + if batch_normalize: + layer_name_bn = layer_name + '_bn' + bn_param_suffixes = ['scale', 'bias', 'mean', 'var'] + for suffix in bn_param_suffixes: + bn_param_name = conv_params.generate_param_name('bn', suffix) + inputs.append(bn_param_name) + batchnorm_node = helper.make_node( + 'BatchNormalization', + inputs=inputs, + outputs=[layer_name_bn], + epsilon=self.epsilon_bn, + momentum=self.momentum_bn, + name=layer_name_bn + ) + self._nodes.append(batchnorm_node) + inputs = [layer_name_bn] + layer_name_output = layer_name_bn + + if layer_dict['activation'] == 'leaky': + layer_name_lrelu = layer_name + '_lrelu' + + lrelu_node = helper.make_node( + 'LeakyRelu', + inputs=inputs, + outputs=[layer_name_lrelu], + name=layer_name_lrelu, + alpha=self.alpha_lrelu + ) + self._nodes.append(lrelu_node) + inputs = [layer_name_lrelu] + layer_name_output = layer_name_lrelu + elif layer_dict['activation'] == 'mish': + layer_name_softplus = layer_name + '_softplus' + layer_name_tanh = layer_name + '_tanh' + layer_name_mish = layer_name + '_mish' + + softplus_node = helper.make_node( + 'Softplus', + inputs=inputs, + outputs=[layer_name_softplus], + name=layer_name_softplus + ) + self._nodes.append(softplus_node) + tanh_node = helper.make_node( + 'Tanh', + inputs=[layer_name_softplus], + outputs=[layer_name_tanh], + name=layer_name_tanh + ) + self._nodes.append(tanh_node) + + inputs.append(layer_name_tanh) + mish_node = helper.make_node( + 'Mul', + inputs=inputs, + outputs=[layer_name_mish], + name=layer_name_mish + ) + self._nodes.append(mish_node) + + inputs = [layer_name_mish] + layer_name_output = layer_name_mish 
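+        # (mish above is decomposed as x * tanh(softplus(x)); swish below
+        # as x * sigmoid(x), since the opset targeted here has no dedicated
+        # op for either activation.)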
+ elif layer_dict['activation'] == 'swish': + layer_name_sigmoid = layer_name + '_sigmoid' + layer_name_swish = layer_name + '_swish' + + sigmoid_node = helper.make_node( + 'Sigmoid', + inputs=inputs, + outputs=[layer_name_sigmoid], + name=layer_name_sigmoid + ) + self._nodes.append(sigmoid_node) + + inputs.append(layer_name_sigmoid) + swish_node = helper.make_node( + 'Mul', + inputs=inputs, + outputs=[layer_name_swish], + name=layer_name_swish + ) + self._nodes.append(swish_node) + + inputs = [layer_name_swish] + layer_name_output = layer_name_swish + elif layer_dict['activation'] == 'logistic': + layer_name_lgx = layer_name + '_lgx' + + lgx_node = helper.make_node( + 'Sigmoid', + inputs=inputs, + outputs=[layer_name_lgx], + name=layer_name_lgx + ) + self._nodes.append(lgx_node) + inputs = [layer_name_lgx] + layer_name_output = layer_name_lgx + elif layer_dict['activation'] == 'linear': + pass + else: + raise TypeError('%s activation not supported' % layer_dict['activation']) + + self.param_dict[layer_name] = conv_params + return layer_name_output, filters + + def _make_shortcut_node(self, layer_name, layer_dict): + """Create an ONNX Add node with the shortcut properties from + the DarkNet-based graph. + + Keyword arguments: + layer_name -- the layer's name (also the corresponding key in layer_configs) + layer_dict -- a layer parameter dictionary (one element of layer_configs) + """ + shortcut_index = layer_dict['from'] + activation = layer_dict['activation'] + assert activation == 'linear' + + first_node_specs = self._get_previous_node_specs() + second_node_specs = self._get_previous_node_specs( + target_index=shortcut_index) + assert first_node_specs.channels == second_node_specs.channels + channels = first_node_specs.channels + inputs = [first_node_specs.name, second_node_specs.name] + shortcut_node = helper.make_node( + 'Add', + inputs=inputs, + outputs=[layer_name], + name=layer_name, + ) + self._nodes.append(shortcut_node) + return layer_name, channels + + def _make_route_node(self, layer_name, layer_dict): + """If the 'layers' parameter from the DarkNet configuration is only one index, continue + node creation at the indicated (negative) index. Otherwise, create an ONNX Concat node + with the route properties from the DarkNet-based graph. 
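+
+        A single index combined with 'groups'/'group_id' (CSPNet-style cfgs)
+        is instead emitted as an ONNX Split node, and only the requested
+        channel group is carried forward.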
+ + Keyword arguments: + layer_name -- the layer's name (also the corresponding key in layer_configs) + layer_dict -- a layer parameter dictionary (one element of layer_configs) + """ + route_node_indexes = layer_dict['layers'] + if len(route_node_indexes) == 1: + if 'groups' in layer_dict.keys(): + # for CSPNet-kind of architecture + assert 'group_id' in layer_dict.keys() + groups = layer_dict['groups'] + group_id = int(layer_dict['group_id']) + assert group_id < groups + index = route_node_indexes[0] + if index > 0: + # +1 for input node (same reason as below) + index += 1 + route_node_specs = self._get_previous_node_specs( + target_index=index) + assert route_node_specs.channels % groups == 0 + channels = route_node_specs.channels // groups + + outputs = [layer_name + '_dummy%d' % i for i in range(groups)] + outputs[group_id] = layer_name + route_node = helper.make_node( + 'Split', + axis=1, + #split=[channels] * groups, # not needed for opset 11 + inputs=[route_node_specs.name], + outputs=outputs, + name=layer_name, + ) + self._nodes.append(route_node) + else: + if route_node_indexes[0] < 0: + # route should skip self, thus -1 + self.route_spec = route_node_indexes[0] - 1 + elif route_node_indexes[0] > 0: + # +1 for input node (same reason as below) + self.route_spec = route_node_indexes[0] + 1 + # This dummy route node would be removed in the end. + layer_name = layer_name + '_dummy' + channels = 1 + else: + assert 'groups' not in layer_dict.keys(), \ + 'groups not implemented for multiple-input route layer!' + inputs = list() + channels = 0 + for index in route_node_indexes: + if index > 0: + # Increment by one because we count the input as + # a node (DarkNet does not) + index += 1 + route_node_specs = self._get_previous_node_specs( + target_index=index) + inputs.append(route_node_specs.name) + channels += route_node_specs.channels + assert inputs + assert channels > 0 + + route_node = helper.make_node( + 'Concat', + axis=1, + inputs=inputs, + outputs=[layer_name], + name=layer_name, + ) + self._nodes.append(route_node) + return layer_name, channels + + def _make_resize_node(self, layer_name, layer_dict): + """Create an ONNX Resize node with the properties from + the DarkNet-based graph. + + Keyword arguments: + layer_name -- the layer's name (also the corresponding key in layer_configs) + layer_dict -- a layer parameter dictionary (one element of layer_configs) + """ + resize_scale_factors = float(layer_dict['stride']) + # Create the scale factor array with node parameters + scales=np.array([1.0, 1.0, resize_scale_factors, resize_scale_factors]).astype(np.float32) + previous_node_specs = self._get_previous_node_specs() + inputs = [previous_node_specs.name] + + channels = previous_node_specs.channels + assert channels > 0 + resize_params = ResizeParams(layer_name, scales) + + # roi input is the second input, so append it before scales + roi_name = resize_params.generate_roi_name() + inputs.append(roi_name) + + scales_name = resize_params.generate_param_name() + inputs.append(scales_name) + + resize_node = helper.make_node( + 'Resize', + coordinate_transformation_mode='asymmetric', + mode='nearest', + nearest_mode='floor', + inputs=inputs, + outputs=[layer_name], + name=layer_name, + ) + self._nodes.append(resize_node) + self.param_dict[layer_name] = resize_params + return layer_name, channels + + def _make_maxpool_node(self, layer_name, layer_dict): + """Create an ONNX Maxpool node with the properties from + the DarkNet-based graph. 
+
+        Keyword arguments:
+        layer_name -- the layer's name (also the corresponding key in layer_configs)
+        layer_dict -- a layer parameter dictionary (one element of layer_configs)
+        """
+        stride = layer_dict['stride']
+        kernel_size = layer_dict['size']
+        previous_node_specs = self._get_previous_node_specs()
+        inputs = [previous_node_specs.name]
+        channels = previous_node_specs.channels
+        kernel_shape = [kernel_size, kernel_size]
+        strides = [stride, stride]
+        assert channels > 0
+        maxpool_node = helper.make_node(
+            'MaxPool',
+            inputs=inputs,
+            outputs=[layer_name],
+            kernel_shape=kernel_shape,
+            strides=strides,
+            auto_pad='SAME_UPPER',
+            name=layer_name,
+        )
+        self._nodes.append(maxpool_node)
+        return layer_name, channels
+
+    def _make_yolo_node(self, layer_name, layer_dict):
+        """Create an ONNX Yolo node.
+
+        These are dummy nodes which will be removed at the end.
+        """
+        channels = 1
+        return layer_name + '_dummy', channels
+
+
+def main():
+    if sys.version_info[0] < 3:
+        raise SystemExit('ERROR: This modified version of yolov3_to_onnx.py '
+                         'script is only compatible with python3...')
+
+    args = parse_args()
+    # path to the DarkNet cfg file of the model
+    cfg_file_path = '%s.cfg' % args.model
+    if not os.path.isfile(cfg_file_path):
+        raise SystemExit('ERROR: file (%s) not found!' % cfg_file_path)
+    # path to the DarkNet weights file of the model
+    weights_file_path = '%s.weights' % args.model
+    if not os.path.isfile(weights_file_path):
+        raise SystemExit('ERROR: file (%s) not found!' % weights_file_path)
+    output_file_path = '%s.onnx' % args.model
+
+    print('Parsing DarkNet cfg file...')
+    parser = DarkNetParser()
+    layer_configs = parser.parse_cfg_file(cfg_file_path)
+    category_num = get_category_num(cfg_file_path)
+    output_tensor_names = get_output_convs(layer_configs)
+    # e.g. ['036_convolutional', '044_convolutional', '052_convolutional']
+
+    c = (category_num + 5) * get_anchor_num(cfg_file_path)
+    h, w = get_h_and_w(layer_configs)
+    if len(output_tensor_names) == 2:
+        output_tensor_shapes = [
+            [c, h // 32, w // 32], [c, h // 16, w // 16]]
+    elif len(output_tensor_names) == 3:
+        output_tensor_shapes = [
+            [c, h // 32, w // 32], [c, h // 16, w // 16],
+            [c, h // 8, w // 8]]
+    elif len(output_tensor_names) == 4:
+        output_tensor_shapes = [
+            [c, h // 64, w // 64], [c, h // 32, w // 32],
+            [c, h // 16, w // 16], [c, h // 8, w // 8]]
+    if is_pan_arch(cfg_file_path):
+        output_tensor_shapes.reverse()
+    output_tensor_dims = OrderedDict(
+        zip(output_tensor_names, output_tensor_shapes))
+
+    print('Building ONNX graph...')
+    builder = GraphBuilderONNX(
+        args.model, output_tensor_dims, MAX_BATCH_SIZE)
+    yolo_model_def = builder.build_onnx_graph(
+        layer_configs=layer_configs,
+        weights_file_path=weights_file_path,
+        verbose=True)
+
+    print('Checking ONNX model...')
+    onnx.checker.check_model(yolo_model_def)
+
+    print('Saving ONNX file...')
+    onnx.save(yolo_model_def, output_file_path)
+
+    print('Done.')
+
+
+if __name__ == '__main__':
+    main()
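+
+# Typical conversion flow (model names follow the '-m' help text above):
+# first fetch the cfg/weights with download_yolo.sh, then for example
+#   python3 yolo_to_onnx.py -m yolov4-416
+#   python3 onnx_to_tensorrt.py -m yolov4-416
+# which writes 'yolov4-416.onnx' and then serializes 'yolov4-416.trt'.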