熟悉目标检测的应该都清楚NMS是什么算法,可是若是咱们要与C++和cuda结合直接写成Pytorch的操做大家清楚怎么写吗?最近在看mmdetection的源码,发现其实原来写C++和cuda的扩展也不难,下面给你们讲一下。html
C ++的扩展是容许用户来建立自定义PyTorch框架外的操做(operators )的,即从PyTorch后端分离。此方法与实现本地PyTorch操做的方式不一样。C ++扩展旨在为您节省大量与将操做与PyTorch后端集成在一块儿相关的样板,同时为基于PyTorch的项目提供高度的灵活性。python
官方给出了一个LLTM的例子,你们也能够看一下。ios
先复习一下NMS的算法:c++
这里我给出一份纯numpy的实现:git
def nms(bounding_boxes, Nt): if len(bounding_boxes) == 0: return [], [] bboxes = np.array(bounding_boxes) x1 = bboxes[:, 0] y1 = bboxes[:, 1] x2 = bboxes[:, 2] y2 = bboxes[:, 3] scores = bboxes[:, 4] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = np.argsort(scores) picked_boxes = [] while order.size > 0: index = order[-1] picked_boxes.append(bounding_boxes[index]) x11 = np.maximum(x1[index], x1[order[:-1]]) y11 = np.maximum(y1[index], y1[order[:-1]]) x22 = np.minimum(x2[index], x2[order[:-1]]) y22 = np.minimum(y2[index], y2[order[:-1]]) w = np.maximum(0.0, x22 - x11 + 1) h = np.maximum(0.0, y22 - y11 + 1) intersection = w * h ious = intersection / (areas[index] + areas[order[:-1]] - intersection) left = np.where(ious < Nt) order = order[left] return picked_boxes
须要编写下面5个文件:github
(1) nms_kernel.cu 主要使用ATen和THC库编写nms_cuda_forward的函数,使用C++编写,涉及一些lazyInitCUDA,THCudaFree,THCCeilDiv 的操做,算法跟咱们前面写的numpy差不太多。算法
(2) nms_cuda.cpp 是调用了nms_kernel.cu文件的nms_cuda_forward封装了一下变成nms_cuda函数。后端
(3) nms_ext.cpp 进一步封装nms_cuda函数为nms,而且经过PYBIND11_MODULE绑定成python可调用的函数。app
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); }
经过上面那样就至关于告诉python函数名定义为nms了。框架
(4) setup.py 就是编译一遍nms_ext,至此你就能够经过nms_ext.nms调用cpp extension做为pytorch的操做了
make_cuda_ext( name='nms_ext', module='mmdet.ops.nms', sources=['src/nms_ext.cpp', 'src/cpu/nms_cpu.cpp'], sources_cuda=[ 'src/cuda/nms_cuda.cpp', 'src/cuda/nms_kernel.cu' ]),
(5) nms_wrapper.py 再次封装 nms_ext.nms,方便使用,使用实例:
from . import nms_ext inds = nms_ext.nms(dets_th, iou_thr)
稍微完整的代码以下,可是我也删减了一些,只剩下nms相关的代码,想要看完整代码能够点击下面的文件名。
nms_kernel.cu (这个估计有部分是Facebook写的)
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include <ATen/ATen.h> #include <ATen/cuda/CUDAContext.h> #include <ATen/DeviceGuard.h> #include <THC/THC.h> #include <THC/THCDeviceUtils.cuh> #include <vector> #include <iostream> int const threadsPerBlock = sizeof(unsigned long long) * 8; __device__ inline float devIoU(float const * const a, float const * const b) { float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float width = max(right - left, 0.f), height = max(bottom - top, 0.f); float interS = width * height; float Sa = (a[2] - a[0]) * (a[3] - a[1]); float Sb = (b[2] - b[0]) * (b[3] - b[1]); return interS / (Sa + Sb - interS); } __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, const float *dev_boxes, unsigned long long *dev_mask) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const float *cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } // boxes is a N x 5 tensor at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) { // Ensure CUDA uses the input tensor device. at::DeviceGuard guard(boxes.device()); using scalar_t = float; AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor"); auto scores = boxes.select(1, 4); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto boxes_sorted = boxes.index_select(0, order_t); int boxes_num = boxes.size(0); const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>(); THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState unsigned long long* mask_dev = NULL; //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, // boxes_num * col_blocks * sizeof(unsigned long long))); mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), THCCeilDiv(boxes_num, threadsPerBlock)); dim3 threads(threadsPerBlock); nms_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev); std::vector<unsigned long long> mask_host(boxes_num * col_blocks); THCudaCheck(cudaMemcpyAsync( &mask_host[0], mask_dev, sizeof(unsigned long long) * boxes_num * col_blocks, cudaMemcpyDeviceToHost, at::cuda::getCurrentCUDAStream() )); std::vector<unsigned long long> remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); int64_t* keep_out = keep.data_ptr<int64_t>(); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / threadsPerBlock; int inblock = i % threadsPerBlock; if (!(remv[nblock] & (1ULL << inblock))) { keep_out[num_to_keep++] = i; unsigned long long *p = &mask_host[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } } } THCudaFree(state, mask_dev); // TODO improve this part return order_t.index({ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( order_t.device(), keep.scalar_type())}); }
#include <torch/extension.h> #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh); at::Tensor nms_cuda(const at::Tensor& dets, const float threshold) { CHECK_CUDA(dets); if (dets.numel() == 0) return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); return nms_cuda_forward(dets, threshold); }
#include <torch/extension.h> #ifdef WITH_CUDA at::Tensor nms_cuda(const at::Tensor& dets, const float threshold); #endif at::Tensor nms(const at::Tensor& dets, const float threshold){ if (dets.device().is_cuda()) { #ifdef WITH_CUDA return nms_cuda(dets, threshold); #else AT_ERROR("nms is not compiled with GPU support"); #endif } # return nms_cpu(dets, threshold); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); }
def make_cuda_ext(name, module, sources, sources_cuda=[]): define_macros = [] extra_compile_args = {'cxx': []} if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': define_macros += [('WITH_CUDA', None)] extension = CUDAExtension extra_compile_args['nvcc'] = [ '-D__CUDA_NO_HALF_OPERATORS__', '-D__CUDA_NO_HALF_CONVERSIONS__', '-D__CUDA_NO_HALF2_OPERATORS__', ] sources += sources_cuda else: print(f'Compiling {name} without CUDA') extension = CppExtension # raise EnvironmentError('CUDA is required to compile MMDetection!') return extension( name=f'{module}.{name}', sources=[os.path.join(*module.split('.'), p) for p in sources], define_macros=define_macros, extra_compile_args=extra_compile_args) if __name__ == '__main__': write_version_py() setup( name='mmdet', version=get_version(), description='Open MMLab Detection Toolbox and Benchmark', long_description=readme(), author='OpenMMLab', author_email='chenkaidev@gmail.com', keywords='computer vision, object detection', url='https://github.com/open-mmlab/mmdetection', packages=find_packages(exclude=('configs', 'tools', 'demo')), package_data={'mmdet.ops': ['*/*.so']}, classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], license='Apache License 2.0', setup_requires=parse_requirements('requirements/build.txt'), tests_require=parse_requirements('requirements/tests.txt'), install_requires=parse_requirements('requirements/runtime.txt'), extras_require={ 'all': parse_requirements('requirements.txt'), 'tests': parse_requirements('requirements/tests.txt'), 'build': parse_requirements('requirements/build.txt'), 'optional': parse_requirements('requirements/optional.txt'), }, ext_modules=[ make_cuda_ext( name='compiling_info', module='mmdet.ops.utils', sources=['src/compiling_info.cpp']), make_cuda_ext( name='nms_ext', module='mmdet.ops.nms', sources=['src/nms_ext.cpp', 'src/cpu/nms_cpu.cpp'], sources_cuda=[ 'src/cuda/nms_cuda.cpp', 'src/cuda/nms_kernel.cu' ]), ], cmdclass={'build_ext': BuildExtension}, zip_safe=False)
from . import nms_ext def nms(dets, iou_thr, device_id=None): # convert dets (tensor or numpy array) to tensor if isinstance(dets, torch.Tensor): is_numpy = False dets_th = dets elif isinstance(dets, np.ndarray): is_numpy = True device = 'cpu' if device_id is None else f'cuda:{device_id}' dets_th = torch.from_numpy(dets).to(device) else: raise TypeError('dets must be either a Tensor or numpy array, ' f'but got {type(dets)}') # execute cpu or cuda nms if dets_th.shape[0] == 0: inds = dets_th.new_zeros(0, dtype=torch.long) else: if dets_th.is_cuda: inds = nms_ext.nms(dets_th, iou_thr) else: inds = nms_ext.nms(dets_th, iou_thr) if is_numpy: inds = inds.cpu().numpy() return dets[inds, :], inds
想要编写一个c++和cuda的扩展给Pytorch使用,其实主要就4步:
经过绑定的函数名function就能够在Pytorch中调用了。