DepthwiseConvExecution.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. //
  2. // DepthwiseConvExecution.cpp
  3. // MNN
  4. //
  5. // Created by MNN on 2019/02/28.
  6. // Copyright © 2018, Alibaba Group Holding Limited
  7. //
  8. #include "backend/opencl/execution/image/DepthwiseConvExecution.hpp"
  9. #include "backend/opencl/execution/image/MultiInputDWConvExecution.hpp"
  10. #include "core/Macro.h"
  11. #include <string.h>
  12. #include "core/TensorUtils.hpp"
  13. #include "backend/opencl/core/OpenCLRunningUtils.hpp"
  14. #include "core/ConvolutionCommon.hpp"
  15. namespace MNN {
  16. namespace OpenCL {
// Constructor for a depthwise convolution with constant (baked-in) weights.
// Uploads the filter to an OpenCL image, selects the kernel variant
// (stride-1/dilation-1 fast path vs. generic), and builds the program.
DepthwiseConvExecution::DepthwiseConvExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend)
    : ConvCommonExecution(op->main_as_Convolution2D(), backend) {
    mOpenCLBackend      = static_cast<OpenCLBackend *>(backend);
    mCon2dParams        = op->main_as_Convolution2D();
    mConv2dCommonParams = mCon2dParams->common();
    // Stored as {Y, X} pairs — kept in that order for the setArg calls in onResize.
    mStrides   = {mConv2dCommonParams->strideY(), mConv2dCommonParams->strideX()};
    mDilations = {mConv2dCommonParams->dilateY(), mConv2dCommonParams->dilateX()};

    int kernelWidth   = mConv2dCommonParams->kernelX();
    int kernelHeight  = mConv2dCommonParams->kernelY();
    int outputChannel = mConv2dCommonParams->outputCount();

    // Host-side filter layout: NCHW with N==1 (depthwise: one 2D filter per channel).
    std::vector<int> filterShape{1, outputChannel, kernelHeight, kernelWidth};
    // Device image layout: width = kH*kW, height = ceil(outputChannel/4) (4 channels per texel).
    std::vector<int> filterImageShape{(int)kernelHeight * kernelWidth, (int)UP_DIV(outputChannel, 4)};

    // Resolve the raw weight pointer (handles quantized/compressed weights via quanCommon).
    const float* filterDataPtr = nullptr;
    int filterDataSize         = 0;
    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
    ConvolutionCommon::getConvParameters(&quanCommon, backend, mCon2dParams, &filterDataPtr, &filterDataSize);
    // NOTE(review): filterDataPtr is dereferenced below without a null check —
    // presumably getConvParameters always yields a valid pointer here; confirm.

    mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
    std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));

    // Staging buffer size in bytes; element type depends on whether the runtime
    // wants the CPU to pre-convert weights to half precision.
    int buffer_size = filterBuffer->elementSize();
    if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
        buffer_size *= sizeof(half_float::half);
    } else {
        buffer_size *= sizeof(float);
    }
    cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
    // Point the staging tensor at the local cl::Buffer; only used within this
    // constructor (convertBufferToImage below), so the local's lifetime suffices.
    filterBuffer->buffer().device = (uint64_t)(&filterBufferCL);

    // Map the staging buffer and copy the weights in (half-converted or raw float).
    cl_int error;
    auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
    if (ptrCL != nullptr && error == CL_SUCCESS) {
        if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
            for (int i = 0; i < filterBuffer->elementSize(); i++) {
                ((half_float::half *)ptrCL)[i] = (half_float::half)(filterDataPtr[i]);
            }
        } else {
            ::memcpy(ptrCL, filterDataPtr, filterBuffer->size());
        }
    } else {
        MNN_ERROR("Map error ptrCL == nullptr \n");
    }
    // NOTE(review): unmap is issued even when the map failed (ptrCL may be null);
    // best-effort, mirrors other executions in this backend — verify it is benign.
    mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, ptrCL);

    mOpenCLBackend->onAcquireBuffer(mFilter.get(), Backend::STATIC);
    MNN::OpenCL::ImageBufferConvertor imageBufferConvertor{mOpenCLBackend->getOpenCLRuntime()};

    // If the staging buffer holds fp32 (no CPU half conversion), tell the
    // converter kernel so via a build flag.
    std::string buildOption = "";
    if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf() == false) {
        buildOption = "-DBUFFER_INP_FP32";
    }
    imageBufferConvertor.convertBufferToImage(filterBuffer.get(), MNN::OpenCL::DW_CONV2D_FILTER, mFilter.get(), false, buildOption);

    auto runtime = mOpenCLBackend->getOpenCLRuntime();
    std::set<std::string> buildOptions;
    // Fast path when both strides and dilations are 1; otherwise the generic
    // kernel (which takes extra dilation/stride args — see onResize).
    std::string kernelName = "depthwise_conv2d";
    if (mConv2dCommonParams->strideX() == 1 && mConv2dCommonParams->strideY() == 1 &&
        mConv2dCommonParams->dilateX() == 1 && mConv2dCommonParams->dilateY() == 1) {
        kernelName = "depthwise_conv2d_s1";
    }
    // Fuse the activation into the kernel via compile-time flags.
    if (mConv2dCommonParams->relu() == true) {
        buildOptions.emplace("-DRELU");
    } else if (mConv2dCommonParams->relu6() == true) {
        buildOptions.emplace("-DRELU6");
    }
    mKernel           = runtime->buildKernel("depthwise_conv2d", kernelName, buildOptions);
    mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
}
// Releases the STATIC device image holding the converted filter weights
// (acquired in the constructor via onAcquireBuffer).
DepthwiseConvExecution::~DepthwiseConvExecution() {
    mOpenCLBackend->onReleaseBuffer(mFilter.get(), Backend::STATIC);
}
// Recomputes work sizes and rebinds kernel arguments for the current tensor
// shapes. The setArg sequence below must match the .cl kernel's parameter
// order exactly; the dilation/stride pair is only present in the generic
// (non-s1) kernel's signature.
ErrorCode DepthwiseConvExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording);
    auto input  = inputs[0];
    auto output = outputs[0];
    // tensorShapeFormat presumably yields {N, H, W, C} — indices below rely on that.
    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);
    // GWS dim 0: channel-blocks * width-blocks (each work item covers a 4-wide
    // strip of 4 channels); dim 1: batch * height.
    mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)),
                       static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};
    auto padding = ConvolutionCommon::convolutionPad(input, output, mConv2dCommonParams);
    mPaddings[0] = padding.second;//padY
    mPaddings[1] = padding.first;//padX

    const int outputHeight = outputShape.at(1);
    const int outputWidth  = outputShape.at(2);

    const int inputHeight   = inputShape.at(1);
    const int inputWidth    = inputShape.at(2);
    const int inputChannels = inputShape.at(3);

    const int inputChannelBlocks = UP_DIV(inputChannels, 4);
    const int filterHeight       = mCon2dParams->common()->kernelY();
    const int filterWidth        = mCon2dParams->common()->kernelX();
    uint32_t idx                 = 0;
    auto kernel                  = &mKernel;

    // Pack shape pairs as int2-style arrays, all in {H/Y, W/X} order.
    int inputImageShape[2]  = {inputHeight, inputWidth};
    int outputImageShape[2] = {outputHeight, outputWidth};
    int strideShape[2]      = {mStrides[0], mStrides[1]};
    int paddingShape[2]     = {mPaddings[0], mPaddings[1]};
    int kernelShape[2]      = {filterHeight, filterWidth};
    int dilationShape[2]    = {mDilations[0], mDilations[1]};
    // Assume the fast-path name; corrected below if strided/dilated. Used only
    // as the tuning-cache key for localWS2DDefault.
    std::string kernelName = "depthwise_conv2d_s1";
    kernel->setArg(idx++, mGlobalWorkSize[0]);
    kernel->setArg(idx++, mGlobalWorkSize[1]);
    kernel->setArg(idx++, openCLImage(input));
    kernel->setArg(idx++, openCLImage(mFilter.get()));
    kernel->setArg(idx++, openCLImage(mBias.get()));
    kernel->setArg(idx++, openCLImage(output));
    kernel->setArg(idx++, sizeof(inputImageShape), inputImageShape);
    kernel->setArg(idx++, static_cast<int>(inputChannelBlocks));
    kernel->setArg(idx++, sizeof(outputImageShape), outputImageShape);
    kernel->setArg(idx++, sizeof(kernelShape), kernelShape);
    kernel->setArg(idx++, sizeof(paddingShape), paddingShape);
    // Generic kernel takes two extra args; condition mirrors the kernel choice
    // made in the constructor.
    if (mStrides[0] != 1 || mStrides[1] != 1 || mDilations[0] != 1 || mDilations[1] != 1) {
        kernel->setArg(idx++, sizeof(dilationShape), dilationShape);
        kernel->setArg(idx++, sizeof(strideShape), strideShape);
        kernelName = "depthwise_conv2d";
    }

    // Auto-tune (or look up) the local work-group size for this kernel/shape.
    mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
    recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
    endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording);
    return NO_ERROR;
}
// Enqueues the prepared kernel. Three compile-time paths: profiling build
// (records an event per launch), record-queue replay, or a plain 2D launch.
ErrorCode DepthwiseConvExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start DepthwiseConvExecution onExecute !\n");
#endif

#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime(),
                &event);

    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"DepthwiseConv", event});
#else
    // Record-queue mode: the kernel was captured in onResize (recordKernel2d),
    // so no launch happens here — at most the recording is re-queued.
    if(mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()){
        // Note: only the emplace_back is conditional on isDevideOpRecord();
        // the early return applies whenever the record queue is in use.
        if(mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
            mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording);
#ifdef LOG_VERBOSE
        MNN_PRINT("End DepthwiseConvExecution onExecute... \n");
#endif
        return NO_ERROR;
    }
    // Default path: immediate 2D launch with the tuned local size.
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime());
#endif

#ifdef LOG_VERBOSE
    MNN_PRINT("end DepthwiseConvExecution onExecute !\n");
#endif
    return NO_ERROR;
}
  158. class DepthwiseConvolutionCreator : public OpenCLBackend::Creator {
  159. public:
  160. virtual ~DepthwiseConvolutionCreator() = default;
  161. virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
  162. const MNN::Op *op, Backend *backend) const override {
  163. MNN_ASSERT(inputs.size() <= 3);
  164. if (inputs.size() == 2 || inputs.size() == 3) {
  165. return new MultiInputDWConvExecution(op, backend);
  166. }
  167. MNN_ASSERT(inputs.size() == 1);
  168. return new DepthwiseConvExecution(inputs, op, backend);
  169. }
  170. };
// Register the creator for depthwise convolution on the IMAGE memory backend.
OpenCLCreatorRegister<DepthwiseConvolutionCreator> __DepthwiseConv_op(OpType_ConvolutionDepthwise, IMAGE);
  172. } // namespace OpenCL
  173. } // namespace MNN