/* * yolo_layer.cu * * This code was originally written by wang-xinyu under MIT license. * I took it from: * * https://github.com/wang-xinyu/tensorrtx/tree/master/yolov4 * * and made necessary modifications. * * - JK Jung */ #include "yolo_layer.h" using namespace Yolo; namespace { // Write values into buffer template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } // Read values from buffer template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int yolo_width, int yolo_height, int num_anchors, float* anchors, int num_classes, int input_width, int input_height, float scale_x_y, int new_coords) { mYoloWidth = yolo_width; mYoloHeight = yolo_height; mNumAnchors = num_anchors; memcpy(mAnchorsHost, anchors, num_anchors * 2 * sizeof(float)); mNumClasses = num_classes; mInputWidth = input_width; mInputHeight = input_height; mScaleXY = scale_x_y; mNewCoords = new_coords; CHECK(cudaMalloc(&mAnchors, MAX_ANCHORS * 2 * sizeof(float))); CHECK(cudaMemcpy(mAnchors, mAnchorsHost, mNumAnchors * 2 * sizeof(float), cudaMemcpyHostToDevice)); } YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; read(d, mThreadCount); read(d, mYoloWidth); read(d, mYoloHeight); read(d, mNumAnchors); memcpy(mAnchorsHost, d, MAX_ANCHORS * 2 * sizeof(float)); d += MAX_ANCHORS * 2 * sizeof(float); read(d, mNumClasses); read(d, mInputWidth); read(d, mInputHeight); read(d, mScaleXY); read(d, mNewCoords); CHECK(cudaMalloc(&mAnchors, MAX_ANCHORS * 2 * sizeof(float))); CHECK(cudaMemcpy(mAnchors, mAnchorsHost, mNumAnchors * 2 * sizeof(float), cudaMemcpyHostToDevice)); assert(d == a + length); } IPluginV2IOExt* YoloLayerPlugin::clone() const NOEXCEPT { YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords); p->setPluginNamespace(mPluginNamespace); return p; } void YoloLayerPlugin::terminate() NOEXCEPT { CHECK(cudaFree(mAnchors)); } size_t YoloLayerPlugin::getSerializationSize() const NOEXCEPT { return sizeof(mThreadCount) + \ sizeof(mYoloWidth) + sizeof(mYoloHeight) + \ sizeof(mNumAnchors) + MAX_ANCHORS * 2 * sizeof(float) + \ sizeof(mNumClasses) + \ sizeof(mInputWidth) + sizeof(mInputHeight) + \ sizeof(mScaleXY) + sizeof(mNewCoords); } void YoloLayerPlugin::serialize(void* buffer) const NOEXCEPT { char* d = static_cast(buffer), *a = d; write(d, mThreadCount); write(d, mYoloWidth); write(d, mYoloHeight); write(d, mNumAnchors); memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float)); d += MAX_ANCHORS * 2 * sizeof(float); write(d, mNumClasses); write(d, mInputWidth); write(d, mInputHeight); write(d, mScaleXY); write(d, mNewCoords); assert(d == a + getSerializationSize()); } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT { assert(index == 0); assert(nbInputDims == 1); assert(inputs[0].d[0] == (mNumClasses + 5) * mNumAnchors); assert(inputs[0].d[1] == mYoloHeight); assert(inputs[0].d[2] == mYoloWidth); // output detection results to the channel dimension int totalsize = mYoloWidth * mYoloHeight * mNumAnchors * sizeof(Detection) / sizeof(float); return Dims3(totalsize, 1, 1); } inline __device__ float sigmoidGPU(float x) { return 1.0f / (1.0f + __expf(-x)); } inline __device__ float scale_sigmoidGPU(float x, float s) { return s * sigmoidGPU(x) - (s - 1.0f) * 0.5f; } // CalDetection(): This kernel processes 1 yolo layer calculation. It // distributes calculations so that 1 GPU thread would be responsible // for each grid/anchor combination. // NOTE: The output (x, y, w, h) are between 0.0 and 1.0 // (relative to orginal image width and height). __global__ void CalDetection(const float *input, float *output, int batch_size, int yolo_width, int yolo_height, int num_anchors, const float *anchors, int num_classes, int input_w, int input_h, float scale_x_y) { int idx = threadIdx.x + blockDim.x * blockIdx.x; Detection* det = ((Detection*) output) + idx; int total_grids = yolo_width * yolo_height; if (idx >= batch_size * total_grids * num_anchors) return; int info_len = 5 + num_classes; //int batch_idx = idx / (total_grids * num_anchors); int group_idx = idx / total_grids; int anchor_idx = group_idx % num_anchors; const float* cur_input = input + group_idx * (info_len * total_grids) + (idx % total_grids); int class_id; float max_cls_logit = -CUDART_INF_F; // minus infinity for (int i = 5; i < info_len; ++i) { float l = *(cur_input + i * total_grids); if (l > max_cls_logit) { max_cls_logit = l; class_id = i - 5; } } float max_cls_prob = sigmoidGPU(max_cls_logit); float box_prob = sigmoidGPU(*(cur_input + 4 * total_grids)); //if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) // return; int row = (idx % total_grids) / yolo_width; int col = (idx % total_grids) % yolo_width; det->bbox[0] = (col + scale_sigmoidGPU(*(cur_input + 0 * total_grids), scale_x_y)) / yolo_width; // [0, 1] det->bbox[1] = (row + scale_sigmoidGPU(*(cur_input + 1 * total_grids), scale_x_y)) / yolo_height; // [0, 1] det->bbox[2] = __expf(*(cur_input + 2 * total_grids)) * *(anchors + 2 * anchor_idx + 0) / input_w; // [0, 1] det->bbox[3] = __expf(*(cur_input + 3 * total_grids)) * *(anchors + 2 * anchor_idx + 1) / input_h; // [0, 1] det->bbox[0] -= det->bbox[2] / 2; // shift from center to top-left det->bbox[1] -= det->bbox[3] / 2; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } inline __device__ float scale(float x, float s) { return s * x - (s - 1.0f) * 0.5f; } inline __device__ float square(float x) { return x * x; } __global__ void CalDetection_NewCoords(const float *input, float *output, int batch_size, int yolo_width, int yolo_height, int num_anchors, const float *anchors, int num_classes, int input_w, int input_h, float scale_x_y) { int idx = threadIdx.x + blockDim.x * blockIdx.x; Detection* det = ((Detection*) output) + idx; int total_grids = yolo_width * yolo_height; if (idx >= batch_size * total_grids * num_anchors) return; int info_len = 5 + num_classes; //int batch_idx = idx / (total_grids * num_anchors); int group_idx = idx / total_grids; int anchor_idx = group_idx % num_anchors; const float* cur_input = input + group_idx * (info_len * total_grids) + (idx % total_grids); int class_id; float max_cls_prob = -CUDART_INF_F; // minus infinity for (int i = 5; i < info_len; ++i) { float l = *(cur_input + i * total_grids); if (l > max_cls_prob) { max_cls_prob = l; class_id = i - 5; } } float box_prob = *(cur_input + 4 * total_grids); //if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) // return; int row = (idx % total_grids) / yolo_width; int col = (idx % total_grids) % yolo_width; det->bbox[0] = (col + scale(*(cur_input + 0 * total_grids), scale_x_y)) / yolo_width; // [0, 1] det->bbox[1] = (row + scale(*(cur_input + 1 * total_grids), scale_x_y)) / yolo_height; // [0, 1] det->bbox[2] = square(*(cur_input + 2 * total_grids)) * 4 * *(anchors + 2 * anchor_idx + 0) / input_w; // [0, 1] det->bbox[3] = square(*(cur_input + 3 * total_grids)) * 4 * *(anchors + 2 * anchor_idx + 1) / input_h; // [0, 1] det->bbox[0] -= det->bbox[2] / 2; // shift from center to top-left det->bbox[1] -= det->bbox[3] / 2; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize) { int num_elements = batchSize * mNumAnchors * mYoloWidth * mYoloHeight; //CHECK(cudaMemset(output, 0, num_elements * sizeof(Detection))); if (mNewCoords) { CalDetection_NewCoords<<<(num_elements + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>> (inputs[0], output, batchSize, mYoloWidth, mYoloHeight, mNumAnchors, (const float*) mAnchors, mNumClasses, mInputWidth, mInputHeight, mScaleXY); } else { CalDetection<<<(num_elements + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>> (inputs[0], output, batchSize, mYoloWidth, mYoloHeight, mNumAnchors, (const float*) mAnchors, mNumClasses, mInputWidth, mInputHeight, mScaleXY); } } #if NV_TENSORRT_MAJOR >= 8 int32_t YoloLayerPlugin::enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT #else // NV_TENSORRT_MAJOR < 8 int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) #endif // NV_TENSORRT_MAJOR { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize); return 0; } YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT { assert(!strcmp(name, getPluginName())); const PluginField* fields = fc->fields; int yolo_width, yolo_height, num_anchors = 0; float anchors[MAX_ANCHORS * 2]; int num_classes, input_multiplier, new_coords = 0; float scale_x_y = 1.0; for (int i = 0; i < fc->nbFields; ++i) { const char* attrName = fields[i].name; if (!strcmp(attrName, "yoloWidth")) { assert(fields[i].type == PluginFieldType::kINT32); yolo_width = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "yoloHeight")) { assert(fields[i].type == PluginFieldType::kINT32); yolo_height = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "numAnchors")) { assert(fields[i].type == PluginFieldType::kINT32); num_anchors = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "numClasses")) { assert(fields[i].type == PluginFieldType::kINT32); num_classes = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "inputMultiplier")) { assert(fields[i].type == PluginFieldType::kINT32); input_multiplier = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "anchors")){ assert(num_anchors > 0 && num_anchors <= MAX_ANCHORS); assert(fields[i].type == PluginFieldType::kFLOAT32); memcpy(anchors, static_cast(fields[i].data), num_anchors * 2 * sizeof(float)); } else if (!strcmp(attrName, "scaleXY")) { assert(fields[i].type == PluginFieldType::kFLOAT32); scale_x_y = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "newCoords")) { assert(fields[i].type == PluginFieldType::kINT32); new_coords = *(static_cast(fields[i].data)); } else { std::cerr << "Unknown attribute: " << attrName << std::endl; assert(0); } } assert(yolo_width > 0 && yolo_height > 0); assert(anchors[0] > 0.0f && anchors[1] > 0.0f); assert(num_classes > 0); assert(input_multiplier == 64 || input_multiplier == 32 || \ input_multiplier == 16 || input_multiplier == 8); assert(scale_x_y >= 1.0); YoloLayerPlugin* obj = new YoloLayerPlugin(yolo_width, yolo_height, num_anchors, anchors, num_classes, yolo_width * input_multiplier, yolo_height * input_multiplier, scale_x_y, new_coords); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT { YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; } // namespace nvinfer1