TensorRT-Demo/plugins/yolo_layer.cu


/*
* yolo_layer.cu
*
* This code was originally written by wang-xinyu under MIT license.
* I took it from:
*
* https://github.com/wang-xinyu/tensorrtx/tree/master/yolov4
*
* and made necessary modifications.
*
* - JK Jung
*/
#include "yolo_layer.h"
using namespace Yolo;
namespace
{
    // Write values into buffer
    template <typename T>
    void write(char*& buffer, const T& val)
    {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    // Read values from buffer
    template <typename T>
    void read(const char*& buffer, T& val)
    {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
} // namespace
namespace nvinfer1
{

YoloLayerPlugin::YoloLayerPlugin(int yolo_width, int yolo_height, int num_anchors, float* anchors, int num_classes, int input_width, int input_height, float scale_x_y, int new_coords)
{
    mYoloWidth = yolo_width;
    mYoloHeight = yolo_height;
    mNumAnchors = num_anchors;
    memcpy(mAnchorsHost, anchors, num_anchors * 2 * sizeof(float));
    mNumClasses = num_classes;
    mInputWidth = input_width;
    mInputHeight = input_height;
    mScaleXY = scale_x_y;
    mNewCoords = new_coords;

    CHECK(cudaMalloc(&mAnchors, MAX_ANCHORS * 2 * sizeof(float)));
    CHECK(cudaMemcpy(mAnchors, mAnchorsHost, mNumAnchors * 2 * sizeof(float), cudaMemcpyHostToDevice));
}
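// Illustrative only (values are hypothetical, not taken from this repo): for a
// 416x416 input, the 13x13 head of a 3-anchor, 80-class model could be built
// roughly like this:
//
//   float head_anchors[6] = {142, 110, 192, 243, 459, 401};  // example (w, h) pairs
//   auto* plugin = new YoloLayerPlugin(
//       13, 13,            // yolo_width, yolo_height (grid size)
//       3, head_anchors,   // num_anchors, anchor (w, h) pairs in input pixels
//       80,                // num_classes
//       416, 416,          // input_width, input_height
//       1.05f,             // scale_x_y
//       0);                // new_coords (0 = classic YOLO decoding)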
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
{
    const char *d = reinterpret_cast<const char *>(data), *a = d;
    read(d, mThreadCount);
    read(d, mYoloWidth);
    read(d, mYoloHeight);
    read(d, mNumAnchors);
    memcpy(mAnchorsHost, d, MAX_ANCHORS * 2 * sizeof(float));
    d += MAX_ANCHORS * 2 * sizeof(float);
    read(d, mNumClasses);
    read(d, mInputWidth);
    read(d, mInputHeight);
    read(d, mScaleXY);
    read(d, mNewCoords);

    CHECK(cudaMalloc(&mAnchors, MAX_ANCHORS * 2 * sizeof(float)));
    CHECK(cudaMemcpy(mAnchors, mAnchorsHost, mNumAnchors * 2 * sizeof(float), cudaMemcpyHostToDevice));

    assert(d == a + length);
}
IPluginV2IOExt* YoloLayerPlugin::clone() const NOEXCEPT
{
    YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords);
    p->setPluginNamespace(mPluginNamespace);
    return p;
}

void YoloLayerPlugin::terminate() NOEXCEPT
{
    CHECK(cudaFree(mAnchors));
}
size_t YoloLayerPlugin::getSerializationSize() const NOEXCEPT
{
    return sizeof(mThreadCount) + \
           sizeof(mYoloWidth) + sizeof(mYoloHeight) + \
           sizeof(mNumAnchors) + MAX_ANCHORS * 2 * sizeof(float) + \
           sizeof(mNumClasses) + \
           sizeof(mInputWidth) + sizeof(mInputHeight) + \
           sizeof(mScaleXY) + sizeof(mNewCoords);
}

void YoloLayerPlugin::serialize(void* buffer) const NOEXCEPT
{
    char* d = static_cast<char*>(buffer), *a = d;
    write(d, mThreadCount);
    write(d, mYoloWidth);
    write(d, mYoloHeight);
    write(d, mNumAnchors);
    memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float));
    d += MAX_ANCHORS * 2 * sizeof(float);
    write(d, mNumClasses);
    write(d, mInputWidth);
    write(d, mInputHeight);
    write(d, mScaleXY);
    write(d, mNewCoords);

    assert(d == a + getSerializationSize());
}
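// NOTE: the write order here must match the read order in the deserializing
// constructor above; both reserve MAX_ANCHORS * 2 floats for the anchor array
// regardless of mNumAnchors, so the serialized size is fixed per plugin version.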
Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT
{
    assert(index == 0);
    assert(nbInputDims == 1);
    assert(inputs[0].d[0] == (mNumClasses + 5) * mNumAnchors);
    assert(inputs[0].d[1] == mYoloHeight);
    assert(inputs[0].d[2] == mYoloWidth);
    // output detection results to the channel dimension
    int totalsize = mYoloWidth * mYoloHeight * mNumAnchors * sizeof(Detection) / sizeof(float);
    return Dims3(totalsize, 1, 1);
}
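// Rough sense of scale (hypothetical numbers, assuming Detection packs 7
// float-sized fields: bbox[4], det_confidence, class_id, class_confidence):
// a 13x13 grid with 3 anchors gives totalsize = 13 * 13 * 3 * 7 = 3549, i.e.
// one flattened Detection per grid-cell/anchor pair in the output channel dim.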
inline __device__ float sigmoidGPU(float x) { return 1.0f / (1.0f + __expf(-x)); }

inline __device__ float scale_sigmoidGPU(float x, float s)
{
    return s * sigmoidGPU(x) - (s - 1.0f) * 0.5f;
}
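// Background note: s * sigmoid(x) - (s - 1) / 2 is the "scale_x_y" trick used
// by YOLOv4-style heads. It stretches the sigmoid's (0, 1) range to a range of
// width s centered at 0.5, so decoded box centers can actually reach the grid
// cell edges; e.g. with s = 1.05 the offset spans (-0.025, 1.025).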
// CalDetection(): This kernel implements the computation of one yolo layer.
// The work is distributed so that one GPU thread is responsible for
// one grid-cell/anchor combination.
// NOTE: The output (x, y, w, h) values are between 0.0 and 1.0
//       (relative to the original image width and height).
__global__ void CalDetection(const float *input, float *output,
                             int batch_size,
                             int yolo_width, int yolo_height,
                             int num_anchors, const float *anchors,
                             int num_classes, int input_w, int input_h,
                             float scale_x_y)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    Detection* det = ((Detection*) output) + idx;
    int total_grids = yolo_width * yolo_height;
    if (idx >= batch_size * total_grids * num_anchors) return;

    int info_len = 5 + num_classes;
    //int batch_idx = idx / (total_grids * num_anchors);
    int group_idx = idx / total_grids;
    int anchor_idx = group_idx % num_anchors;
    const float* cur_input = input + group_idx * (info_len * total_grids) + (idx % total_grids);

    int class_id;
    float max_cls_logit = -CUDART_INF_F; // minus infinity
    for (int i = 5; i < info_len; ++i) {
        float l = *(cur_input + i * total_grids);
        if (l > max_cls_logit) {
            max_cls_logit = l;
            class_id = i - 5;
        }
    }
    float max_cls_prob = sigmoidGPU(max_cls_logit);
    float box_prob = sigmoidGPU(*(cur_input + 4 * total_grids));
    //if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH)
    //    return;

    int row = (idx % total_grids) / yolo_width;
    int col = (idx % total_grids) % yolo_width;

    det->bbox[0] = (col + scale_sigmoidGPU(*(cur_input + 0 * total_grids), scale_x_y)) / yolo_width;   // [0, 1]
    det->bbox[1] = (row + scale_sigmoidGPU(*(cur_input + 1 * total_grids), scale_x_y)) / yolo_height;  // [0, 1]
    det->bbox[2] = __expf(*(cur_input + 2 * total_grids)) * *(anchors + 2 * anchor_idx + 0) / input_w; // [0, 1]
    det->bbox[3] = __expf(*(cur_input + 3 * total_grids)) * *(anchors + 2 * anchor_idx + 1) / input_h; // [0, 1]

    det->bbox[0] -= det->bbox[2] / 2;  // shift from center to top-left
    det->bbox[1] -= det->bbox[3] / 2;

    det->det_confidence = box_prob;
    det->class_id = class_id;
    det->class_confidence = max_cls_prob;
}
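// Worked example (hypothetical numbers): with scale_x_y = 1.0, a raw tx of 0
// decodes to sigmoid(0) = 0.5, so a box centered in grid column 6 of a 13-wide
// head gets a center x of (6 + 0.5) / 13 = 0.5, i.e. the middle of the image;
// the stored bbox[0] is that center minus half of the decoded width.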
inline __device__ float scale(float x, float s)
{
    return s * x - (s - 1.0f) * 0.5f;
}

inline __device__ float square(float x)
{
    return x * x;
}
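// CalDetection_NewCoords(): same thread layout as CalDetection(), but for
// models exported with new_coords=1 (e.g. Scaled-YOLOv4 style heads), where
// the conv outputs are already logistic-activated. Box w/h therefore use
// square(x) * 4 = (2x)^2 times the anchor instead of exp(x), and the
// objectness and class scores are used without applying sigmoid again.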
__global__ void CalDetection_NewCoords(const float *input, float *output,
                                       int batch_size,
                                       int yolo_width, int yolo_height,
                                       int num_anchors, const float *anchors,
                                       int num_classes, int input_w, int input_h,
                                       float scale_x_y)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    Detection* det = ((Detection*) output) + idx;
    int total_grids = yolo_width * yolo_height;
    if (idx >= batch_size * total_grids * num_anchors) return;

    int info_len = 5 + num_classes;
    //int batch_idx = idx / (total_grids * num_anchors);
    int group_idx = idx / total_grids;
    int anchor_idx = group_idx % num_anchors;
    const float* cur_input = input + group_idx * (info_len * total_grids) + (idx % total_grids);

    int class_id;
    float max_cls_prob = -CUDART_INF_F; // minus infinity
    for (int i = 5; i < info_len; ++i) {
        float l = *(cur_input + i * total_grids);
        if (l > max_cls_prob) {
            max_cls_prob = l;
            class_id = i - 5;
        }
    }
    float box_prob = *(cur_input + 4 * total_grids);
    //if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH)
    //    return;

    int row = (idx % total_grids) / yolo_width;
    int col = (idx % total_grids) % yolo_width;

    det->bbox[0] = (col + scale(*(cur_input + 0 * total_grids), scale_x_y)) / yolo_width;                  // [0, 1]
    det->bbox[1] = (row + scale(*(cur_input + 1 * total_grids), scale_x_y)) / yolo_height;                 // [0, 1]
    det->bbox[2] = square(*(cur_input + 2 * total_grids)) * 4 * *(anchors + 2 * anchor_idx + 0) / input_w; // [0, 1]
    det->bbox[3] = square(*(cur_input + 3 * total_grids)) * 4 * *(anchors + 2 * anchor_idx + 1) / input_h; // [0, 1]

    det->bbox[0] -= det->bbox[2] / 2;  // shift from center to top-left
    det->bbox[1] -= det->bbox[3] / 2;

    det->det_confidence = box_prob;
    det->class_id = class_id;
    det->class_confidence = max_cls_prob;
}
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize)
{
    int num_elements = batchSize * mNumAnchors * mYoloWidth * mYoloHeight;

    //CHECK(cudaMemset(output, 0, num_elements * sizeof(Detection)));

    if (mNewCoords) {
        CalDetection_NewCoords<<<(num_elements + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
            (inputs[0], output, batchSize, mYoloWidth, mYoloHeight, mNumAnchors, (const float*) mAnchors, mNumClasses, mInputWidth, mInputHeight, mScaleXY);
    } else {
        CalDetection<<<(num_elements + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
            (inputs[0], output, batchSize, mYoloWidth, mYoloHeight, mNumAnchors, (const float*) mAnchors, mNumClasses, mInputWidth, mInputHeight, mScaleXY);
    }
}
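// Launch-size sanity check (hypothetical numbers): with batchSize = 1, 3 anchors
// and a 13x13 grid, num_elements = 1 * 3 * 13 * 13 = 507; if mThreadCount were
// 256, the kernel would launch ceil(507 / 256) = 2 blocks of 256 threads, and
// the early-return bound inside the kernel masks off the unused trailing threads.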
#if NV_TENSORRT_MAJOR >= 8
int32_t YoloLayerPlugin::enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT
#else  // NV_TENSORRT_MAJOR < 8
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream)
#endif // NV_TENSORRT_MAJOR
{
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
    return 0;
}
YoloPluginCreator::YoloPluginCreator()
{
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const char* YoloPluginCreator::getPluginName() const NOEXCEPT
{
    return "YoloLayer_TRT";
}

const char* YoloPluginCreator::getPluginVersion() const NOEXCEPT
{
    return "1";
}

const PluginFieldCollection* YoloPluginCreator::getFieldNames() NOEXCEPT
{
    return &mFC;
}
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT
{
    assert(!strcmp(name, getPluginName()));
    const PluginField* fields = fc->fields;

    // Initialize to invalid values so the sanity checks below catch any
    // attribute that is missing from the field collection.
    // NOTE: "numAnchors" must appear before "anchors" in the field list.
    int yolo_width = -1, yolo_height = -1, num_anchors = 0;
    float anchors[MAX_ANCHORS * 2] = {0.0f};
    int num_classes = -1, input_multiplier = -1, new_coords = 0;
    float scale_x_y = 1.0f;

    for (int i = 0; i < fc->nbFields; ++i)
    {
        const char* attrName = fields[i].name;
        if (!strcmp(attrName, "yoloWidth"))
        {
            assert(fields[i].type == PluginFieldType::kINT32);
            yolo_width = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "yoloHeight"))
        {
            assert(fields[i].type == PluginFieldType::kINT32);
            yolo_height = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "numAnchors"))
        {
            assert(fields[i].type == PluginFieldType::kINT32);
            num_anchors = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "numClasses"))
        {
            assert(fields[i].type == PluginFieldType::kINT32);
            num_classes = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "inputMultiplier"))
        {
            assert(fields[i].type == PluginFieldType::kINT32);
            input_multiplier = *(static_cast<const int*>(fields[i].data));
        }
        else if (!strcmp(attrName, "anchors"))
        {
            assert(num_anchors > 0 && num_anchors <= MAX_ANCHORS);
            assert(fields[i].type == PluginFieldType::kFLOAT32);
            memcpy(anchors, static_cast<const float*>(fields[i].data), num_anchors * 2 * sizeof(float));
        }
        else if (!strcmp(attrName, "scaleXY"))
        {
            assert(fields[i].type == PluginFieldType::kFLOAT32);
            scale_x_y = *(static_cast<const float*>(fields[i].data));
        }
        else if (!strcmp(attrName, "newCoords"))
        {
            assert(fields[i].type == PluginFieldType::kINT32);
            new_coords = *(static_cast<const int*>(fields[i].data));
        }
        else
        {
            std::cerr << "Unknown attribute: " << attrName << std::endl;
            assert(0);
        }
    }
    assert(yolo_width > 0 && yolo_height > 0);
    assert(anchors[0] > 0.0f && anchors[1] > 0.0f);
    assert(num_classes > 0);
    assert(input_multiplier == 64 || input_multiplier == 32 || \
           input_multiplier == 16 || input_multiplier == 8);
    assert(scale_x_y >= 1.0f);

    YoloLayerPlugin* obj = new YoloLayerPlugin(yolo_width, yolo_height, num_anchors, anchors, num_classes, yolo_width * input_multiplier, yolo_height * input_multiplier, scale_x_y, new_coords);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
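// Rough sketch (illustration only; field values are hypothetical) of how a
// builder-side caller could feed this creator directly through the TensorRT
// PluginField API:
//
//   int w = 13, h = 13, na = 3, nc = 80, mult = 32, newc = 0;
//   float sxy = 1.05f;
//   float anc[6] = {142, 110, 192, 243, 459, 401};
//   std::vector<PluginField> f{
//       {"yoloWidth",       &w,    PluginFieldType::kINT32,   1},
//       {"yoloHeight",      &h,    PluginFieldType::kINT32,   1},
//       {"numAnchors",      &na,   PluginFieldType::kINT32,   1},
//       {"numClasses",      &nc,   PluginFieldType::kINT32,   1},
//       {"inputMultiplier", &mult, PluginFieldType::kINT32,   1},
//       {"anchors",         anc,   PluginFieldType::kFLOAT32, 6},
//       {"scaleXY",         &sxy,  PluginFieldType::kFLOAT32, 1},
//       {"newCoords",       &newc, PluginFieldType::kINT32,   1}};
//   PluginFieldCollection fc{static_cast<int>(f.size()), f.data()};
//   auto* plugin = creator.createPlugin("YoloLayer_TRT", &fc);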
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT
{
    YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

} // namespace nvinfer1