//
//  ConvExecution.cpp
//  MNN
//
//  Created by MNN on 2019/02/28.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "ConvExecution.hpp"
#include "ConvWinograd.hpp"
#include "core/ConvolutionCommon.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"

#define UNIT 4
namespace MNN {
namespace OpenCL {
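
// Uploads the convolution bias to the GPU: the channel count is aligned up to a
// multiple of 4, the data is staged through a host-mapped OpenCL buffer
// (converted to half precision when the runtime transposes weights on the CPU),
// then copied into an OpenCL image of size UP_DIV(biasSize, 4) x 1.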
ConvCommonExecution::ConvCommonExecution(const Convolution2D *conv2dParams, Backend *backend) : Execution(backend) {
    auto openclBackend       = (OpenCLBackend *)backend;
    int biasSize             = conv2dParams->bias()->size();
    const float *biasDataPtr = conv2dParams->bias()->data();

    int buffer_size = ALIGN_UP4(biasSize);
    if (openclBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
        buffer_size *= sizeof(half_float::half);
    } else {
        buffer_size *= sizeof(float);
    }

    cl::Buffer biasBuffer(openclBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
    cl_int error;
    auto biasPtrCL = openclBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
        biasBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
    if (biasPtrCL != nullptr && error == CL_SUCCESS) {
        if (openclBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
            for (int i = 0; i < biasSize; i++) {
                ((half_float::half *)biasPtrCL)[i] = (half_float::half)(biasDataPtr[i]);
            }
            for (int i = biasSize; i < ALIGN_UP4(biasSize); i++) {
                ((half_float::half *)biasPtrCL)[i] = (half_float::half)(0.0f);
            }
        } else {
            ::memset(biasPtrCL, 0, ALIGN_UP4(biasSize) * sizeof(float));
            ::memcpy(biasPtrCL, biasDataPtr, biasSize * sizeof(float));
        }
    } else {
        MNN_ERROR("Map error biasPtrCL == nullptr \n");
    }
    openclBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(biasBuffer, biasPtrCL);

    mBias.reset(Tensor::createDevice<float>({1, 1, 1, biasSize}));
    backend->onAcquireBuffer(mBias.get(), Backend::STATIC);
    copyBufferToImage(openclBackend->getOpenCLRuntime(), biasBuffer, openCLImage(mBias.get()), UP_DIV(biasSize, 4), 1);
}

ConvCommonExecution::~ConvCommonExecution() {
    MNN_ASSERT(nullptr != mBias);
    backend()->onReleaseBuffer(mBias.get(), Backend::STATIC);
}
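
// Builds the convolution: loads (and, for quantized models, dequantizes) the
// weights, selects a kernel variant based on kernel size, strides, padding and
// GPU type, then pre-packs the weights into the layout that variant expects.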
ConvExecution::ConvExecution(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const MNN::Op *op, Backend *backend)
    : ConvCommonExecution(op->main_as_Convolution2D(), backend) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution init !\n");
#endif
    mOpenCLBackend                 = static_cast<OpenCLBackend *>(backend);
    const auto *conv2dParams       = op->main_as_Convolution2D();
    const auto *conv2dCommonParams = conv2dParams->common();
    mConv2dCommonParams            = conv2dCommonParams;
    mStrides                       = {conv2dCommonParams->strideY(), conv2dCommonParams->strideX()};
    mDilations                     = {conv2dCommonParams->dilateY(), conv2dCommonParams->dilateX()};

    auto pad     = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mConv2dCommonParams);
    mPaddings[0] = pad.second;
    mPaddings[1] = pad.first;

    int kernelWidth   = conv2dCommonParams->kernelX();
    int kernelHeight  = conv2dCommonParams->kernelY();
    int outputChannel = conv2dCommonParams->outputCount();

    auto gpuType     = mOpenCLBackend->getOpenCLRuntime()->getGpuType();
    mWeightUseBuffer = gpuType == GpuType::MALI;

    int weightSize             = 0;
    const float *filterDataPtr = nullptr;
    std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
    if (nullptr != conv2dParams->quanParameter()) {
        quanCommon = ConvolutionCommon::load(conv2dParams, backend, true);
        if (nullptr == quanCommon) {
            MNN_ERROR("Memory not enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
        }
        if (quanCommon->weightFloat.get() == nullptr) {
            MNN_PRINT("quanCommon->weightFloat.get() == nullptr \n");
        }
        // Back to float
        filterDataPtr = quanCommon->weightFloat.get();
        weightSize    = quanCommon->weightFloat.size();
    } else if (nullptr == conv2dParams->weight() || nullptr == conv2dParams->bias()) {
        MNN_ERROR("%s has no weight or bias. The model may be a benchmark model; please restore the weight/bias first.\n", op->name()->c_str());
    }
    if (nullptr == filterDataPtr) {
        weightSize    = conv2dParams->weight()->size();
        filterDataPtr = conv2dParams->weight()->data();
    }
    int inputChannel = weightSize / (kernelWidth * kernelHeight * outputChannel);

    // Select the optimal conv kernel variant.
    std::string kernelName = "conv_2d_c4h1w4";
    if (kernelHeight == kernelWidth && kernelHeight == 1 && mPaddings[0] == 0 &&
        mPaddings[1] == 0) {
        mConv1x1Opt = (mStrides[0] == 1 && mStrides[1] == 1 && gpuType == GpuType::MALI && !mWeightUseBuffer);
#if 0
        if ((gpuType == GpuType::ADRENO)) {
            uint64_t useLocalSize = UNIT * UNIT * 4 * sizeof(float) * 4;
            if (useLocalSize >= mOpenCLBackend->getOpenCLRuntime()->getMaxLocalMem()) {
                mUseLocalMem = false;
            } else {
                kernelName   = "conv_2d_1x1_local";
                mUseLocalMem = true;
            }
        }
#endif
        if (!mUseLocalMem) {
            if (mConv1x1Opt) {
                kernelName = "conv_2d_1x1_mali";
            } else {
                kernelName = "conv_2d_1x1";
            }
        }
    }
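
    // Three weight-packing paths follow:
    //   1. mConv1x1Opt: pack the 1x1 weights into a raw buffer, 4x4 interleaved as
    //      bufferIdx = (o/4) * ROUND_UP(inputChannel, 4) * 4 + (i/4) * 16 + (o%4) * 4 + (i%4).
    //      E.g. with inputChannel = 3, (o = 5, i = 2) lands at 1*16 + 0 + 1*4 + 2 = 22.
    //   2. mWeightUseBuffer (Mali) 1x1: pack as
    //      bufferIdx = (o/4) * ROUND_UP(inputChannel, 4) * 4 + i * 4 + (o%4).
    //   3. Otherwise: stage the NCHW weights in a buffer and convert them to the
    //      NC4HW4 buffer (Mali) or filter image layout the kernels read.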
    if (mConv1x1Opt && !mUseLocalMem) {
        cl_int error;
        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({UP_DIV(outputChannel, 4) * 4, UP_DIV(inputChannel, 4) * 4, kernelWidth, kernelHeight}));
        int buffer_size = filterBuffer->elementSize();
        if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }
        mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
        auto kernelBufferPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*(mKernelBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if (kernelBufferPtr != nullptr && error == CL_SUCCESS) {
            ::memset(kernelBufferPtr, 0, buffer_size);
            for (int o = 0; o < outputChannel; o++) {
                for (int i = 0; i < inputChannel; i++) {
                    int bufferIdx = (o / 4) * ROUND_UP(inputChannel, 4) * 4 + (i / 4) * 16 + (o % 4) * 4 + (i % 4);
                    int filterIdx = o * inputChannel + i;
                    if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
                        ((half_float::half *)kernelBufferPtr)[bufferIdx] = (half_float::half)(filterDataPtr[filterIdx]);
                    } else {
                        ((float *)kernelBufferPtr)[bufferIdx] = (float)(filterDataPtr[filterIdx]);
                    }
                }
            }
        } else {
            MNN_ERROR("Map error ptrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mKernelBuffer.get()), kernelBufferPtr);

        // bias
        int biasSize             = conv2dParams->bias()->size();
        const float *biasDataPtr = conv2dParams->bias()->data();
        buffer_size = ALIGN_UP4(biasSize);
        if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }
        mBiasBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
        auto biasPtrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
            *(mBiasBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if (biasPtrCL != nullptr && error == CL_SUCCESS) {
            if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
                for (int i = 0; i < biasSize; i++) {
                    ((half_float::half *)biasPtrCL)[i] = (half_float::half)(biasDataPtr[i]);
                }
                for (int i = biasSize; i < ALIGN_UP4(biasSize); i++) {
                    ((half_float::half *)biasPtrCL)[i] = (half_float::half)(0.0f);
                }
            } else {
                ::memset(biasPtrCL, 0, ALIGN_UP4(biasSize) * sizeof(float));
                ::memcpy(biasPtrCL, biasDataPtr, biasSize * sizeof(float));
            }
        } else {
            MNN_ERROR("Map error biasPtrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mBiasBuffer.get()), biasPtrCL);
    } else if (kernelHeight == kernelWidth && kernelHeight == 1 && mPaddings[0] == 0 && mPaddings[1] == 0 && mWeightUseBuffer) {
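        // Mali 1x1 path with buffer-backed weights: pack so that each group of
        // 4 consecutive addresses holds the 4 output channels of one input channel.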
        cl_int error;
        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({UP_DIV(outputChannel, 4), ROUND_UP(inputChannel, 4), 4}));
        int buffer_size = filterBuffer->elementSize();
        if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }
        mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
        auto kernelBufferPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*(mKernelBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if (kernelBufferPtr != nullptr && error == CL_SUCCESS) {
            ::memset(kernelBufferPtr, 0, buffer_size);
            for (int o = 0; o < outputChannel; o++) {
                for (int i = 0; i < inputChannel; i++) {
                    int bufferIdx = (o / 4) * ROUND_UP(inputChannel, 4) * 4 + i * 4 + (o % 4);
                    int filterIdx = o * inputChannel + i;
                    if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
                        ((half_float::half *)kernelBufferPtr)[bufferIdx] = (half_float::half)(filterDataPtr[filterIdx]);
                    } else {
                        ((float *)kernelBufferPtr)[bufferIdx] = (float)(filterDataPtr[filterIdx]);
                    }
                }
            }
        } else {
            MNN_ERROR("Map error ptrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mKernelBuffer.get()), kernelBufferPtr);
    } else {
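        // Generic path: stage the raw NCHW weights in a host-visible buffer,
        // then convert them to the layout the conv kernels read -- an NC4HW4
        // buffer on Mali (mWeightUseBuffer) or a filter image elsewhere.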
        std::vector<int> filterImageShape{(int)inputChannel, (int)(UP_DIV(outputChannel, 4) * kernelWidth * kernelHeight)};
        std::shared_ptr<Tensor> filterBuffer(
            Tensor::createDevice<float>({outputChannel, inputChannel, kernelWidth, kernelHeight}));
        int buffer_size = filterBuffer->elementSize();
        if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
            buffer_size *= sizeof(half_float::half);
        } else {
            buffer_size *= sizeof(float);
        }
        cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
        filterBuffer->buffer().device = (uint64_t)(&filterBufferCL);
        cl_int error;
        auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &error);
        if (ptrCL != nullptr && error == CL_SUCCESS) {
            ::memset(ptrCL, 0, buffer_size);
            if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf()) {
                for (int i = 0; i < filterBuffer->elementSize(); i++) {
                    ((half_float::half *)ptrCL)[i] = (half_float::half)(filterDataPtr[i]);
                }
            } else {
                ::memcpy(ptrCL, filterDataPtr, filterBuffer->size());
            }
        } else {
            MNN_ERROR("Map error ptrCL == nullptr \n");
        }
        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, ptrCL);

        if (mWeightUseBuffer) {
            mFilter.reset(Tensor::createDevice<float>({UP_DIV(inputChannel, 4) * 4, UP_DIV(outputChannel, 4), kernelWidth * kernelHeight, 4}));
            int kernel_buffer_size = UP_DIV(outputChannel, 4) * 4 * UP_DIV(inputChannel, 4) * 4 * kernelWidth * kernelHeight;
            if (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()) {
                kernel_buffer_size *= sizeof(half_float::half);
            } else {
                kernel_buffer_size *= sizeof(float);
            }
            mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, kernel_buffer_size));
            mFilter.get()->buffer().device = (uint64_t)mKernelBuffer.get();
            MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
            bool needTrans = false;
            if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf() == false) {
                needTrans = true;
            }
            bufferConvertor.convertToNC4HW4Buffer(filterBuffer.get(), MNN::OpenCL::CONV2D_FILTER, mFilter.get(), needTrans);
        } else {
            mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
            mOpenCLBackend->onAcquireBuffer(mFilter.get(), Backend::STATIC);
            MNN::OpenCL::ImageBufferConvertor imageBufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
            std::string buildOption = "";
            if (mOpenCLBackend->getOpenCLRuntime()->isWeightCpuTransHalf() == false) {
                buildOption = "-DBUFFER_INP_FP32";
            }
            imageBufferConvertor.convertBufferToImage(filterBuffer.get(), MNN::OpenCL::CONV2D_FILTER, mFilter.get(), false, buildOption);
        }
    }
    // Create Kernel
    if (mStrides[0] == 1 && mStrides[1] == 1 && mDilations[0] == 1 && mDilations[1] == 1) {
        mBuildOptions.emplace("-DMNN_CONV_S1D1");
    }
    mBuildOptions.emplace("-DBIAS");
    if (mConv2dCommonParams->relu()) {
        mBuildOptions.emplace("-DRELU");
    } else if (mConv2dCommonParams->relu6()) {
        mBuildOptions.emplace("-DRELU6");
    }
    if (mWeightUseBuffer) {
        mBuildOptions.emplace("-DUSE_BUFFER");
    }
    mKernel           = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName, mBuildOptions);
    mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mKernel));
#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution init !\n");
#endif
}

ConvExecution::~ConvExecution() {
    if ((mUseLocalMem || !mConv1x1Opt) && !mWeightUseBuffer) {
        mOpenCLBackend->onReleaseBuffer(mFilter.get(), Backend::STATIC);
    }
}
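
// Recomputes padding and work sizes for the current shapes, auto-tunes among the
// candidate kernels for this configuration, and records the chosen kernel's
// arguments and work sizes for later dispatch.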
ErrorCode ConvExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution onResize !\n");
#endif
    startRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording);
    auto input  = inputs[0];
    auto output = outputs[0];

    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);
    const int height             = outputShape.at(1);
    const int width              = outputShape.at(2);
    const int inputHeight        = inputShape.at(1);
    const int inputWidth         = inputShape.at(2);
    const int inputChannels      = inputShape.at(3);
    const int inputChannelBlocks = UP_DIV(inputChannels, 4);

    int kernelHeight = mConv2dCommonParams->kernelY();
    int kernelWidth  = mConv2dCommonParams->kernelX();
    auto pad     = ConvolutionCommon::convolutionPad(input, output, mConv2dCommonParams);
    mPaddings[0] = pad.second;
    mPaddings[1] = pad.first;

    std::string info = std::to_string(inputChannels) + "_" + std::to_string(kernelHeight) + "_" + std::to_string(kernelWidth) + "_" + std::to_string(mStrides[0]) + "_" + std::to_string(mStrides[1]) + "_" + std::to_string(mDilations[0]) + "_" + std::to_string(mDilations[1]);

    if (kernelHeight == kernelWidth && kernelHeight == 1 && mPaddings[0] == 0 && mPaddings[1] == 0) {
        if (mConv1x1Opt) {
            auto kernel  = &mKernel;
            uint32_t idx = 0;
            if (mUseLocalMem) {
                mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4)), static_cast<uint32_t>(UP_DIV(outputShape.at(2), 4)),
                                   static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};
                std::vector<uint32_t> lws{UNIT, UNIT, 1};
                mLocalWorkSize = lws;
                kernel->setArg(idx++, mGlobalWorkSize[0]);
                kernel->setArg(idx++, mGlobalWorkSize[1]);
                kernel->setArg(idx++, mGlobalWorkSize[2]);
                kernel->setArg(idx++, openCLImage(input));
                kernel->setArg(idx++, openCLImage(mFilter.get()));
                kernel->setArg(idx++, openCLImage(mBias.get()));
                kernel->setArg(idx++, openCLImage(output));
                kernel->setArg(idx++, static_cast<int>(inputChannelBlocks));
                kernel->setArg(idx++, height);
                kernel->setArg(idx++, width);
                recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
            } else {
                mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), 4) * UP_DIV(outputShape.at(2), 4)),
                                   static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};
                kernel->setArg(idx++, mGlobalWorkSize[0]);
                kernel->setArg(idx++, mGlobalWorkSize[1]);
                kernel->setArg(idx++, UP_DIV(width, 4));
                kernel->setArg(idx++, openCLImage(input));
                kernel->setArg(idx++, *mKernelBuffer.get());
                kernel->setArg(idx++, *mBiasBuffer.get());
                kernel->setArg(idx++, openCLImage(output));
                kernel->setArg(idx++, static_cast<int>(inputChannelBlocks));
                kernel->setArg(idx++, height);
                kernel->setArg(idx++, width);
                std::string kernelName = "conv_2d_1x1_mali";
                mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
                recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
            }
        } else {
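            // Auto-tune: build each candidate 1x1 kernel, time it via
            // localWS2DDefault, and keep the cheapest kernel and local size.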
            int inputImageShape[2]  = {inputHeight, inputWidth};
            int outputImageShape[2] = {height, width};
            int strideShape[2]      = {mStrides[0], mStrides[1]};
            const int total_kernel  = 2;
            std::string kernelName[total_kernel] = {"conv_2d_1x1", "conv_2d_1x1_c8h1w4"};
            int itemC[total_kernel] = {4, 8};
            int itemH[total_kernel] = {1, 1};
            int itemW[total_kernel] = {4, 4};
            int actual_kernel       = total_kernel;

            cl::Kernel kernel[total_kernel];
            std::vector<uint32_t> globalWorkSize[total_kernel];
            std::vector<uint32_t> localWorkSize[total_kernel];
            std::pair<int, int> min_cost(INT_MAX, 0); // (min_time, min_index)
            for (int knl_idx = 0; knl_idx < total_kernel; knl_idx++) {
                kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], mBuildOptions);
                uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
                globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};

                uint32_t idx = 0;
                kernel[knl_idx].setArg(idx++, globalWorkSize[knl_idx][0]);
                kernel[knl_idx].setArg(idx++, globalWorkSize[knl_idx][1]);
                kernel[knl_idx].setArg(idx++, openCLImage(input));
                if (mWeightUseBuffer) {
                    kernel[knl_idx].setArg(idx++, *mKernelBuffer.get());
                } else {
                    kernel[knl_idx].setArg(idx++, openCLImage(mFilter.get()));
                }
                kernel[knl_idx].setArg(idx++, openCLImage(mBias.get()));
                kernel[knl_idx].setArg(idx++, openCLImage(output));
                kernel[knl_idx].setArg(idx++, sizeof(inputImageShape), inputImageShape);
                kernel[knl_idx].setArg(idx++, static_cast<int>(inputChannelBlocks));
                kernel[knl_idx].setArg(idx++, sizeof(outputImageShape), outputImageShape);
                kernel[knl_idx].setArg(idx++, sizeof(strideShape), strideShape);
                kernel[knl_idx].setArg(idx++, UP_DIV(width, 4));
                kernel[knl_idx].setArg(idx++, UP_DIV(outputShape.at(3), 4));

                std::pair<std::vector<uint32_t>, uint32_t> retTune;
                retTune = localWS2DDefault(globalWorkSize[knl_idx], mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx]);
                //printf("conv1x1 kernel_%d = %d [%d, %d]\n", knl_idx, retTune.second, retTune.first[0], retTune.first[1]);
                if (min_cost.first > retTune.second) {
                    min_cost.first  = retTune.second;
                    min_cost.second = knl_idx;
                    mLocalWorkSize  = {retTune.first[0], retTune.first[1]};
                }
            }
            int min_index = min_cost.second;
            //printf("min_index = %d %d\n", min_index, min_cost.first);
            mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
            mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], mBuildOptions);

            uint32_t idx = 0;
            mKernel.setArg(idx++, mGlobalWorkSize[0]);
            mKernel.setArg(idx++, mGlobalWorkSize[1]);
            mKernel.setArg(idx++, openCLImage(input));
            if (mWeightUseBuffer) {
                mKernel.setArg(idx++, *mKernelBuffer.get());
            } else {
                mKernel.setArg(idx++, openCLImage(mFilter.get()));
            }
            mKernel.setArg(idx++, openCLImage(mBias.get()));
            mKernel.setArg(idx++, openCLImage(output));
            mKernel.setArg(idx++, sizeof(inputImageShape), inputImageShape);
            mKernel.setArg(idx++, static_cast<int>(inputChannelBlocks));
            mKernel.setArg(idx++, sizeof(outputImageShape), outputImageShape);
            mKernel.setArg(idx++, sizeof(strideShape), strideShape);
            mKernel.setArg(idx++, UP_DIV(width, 4));
            mKernel.setArg(idx++, UP_DIV(outputShape.at(3), 4));
            recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
        }
    } else {
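        // General convolution: tune across three output tilings
        // (c4h1w4, c4h4w1, c8h4w1) and keep the fastest.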
        int inputImageShape[2]  = {inputHeight, inputWidth};
        int outputImageShape[2] = {height, width};
        int kernelShape[2]      = {kernelHeight, kernelWidth};
        int strideShape[2]      = {mStrides[0], mStrides[1]};
        int paddingShape[2]     = {mPaddings[0], mPaddings[1]};
        int dilationShape[2]    = {mDilations[0], mDilations[1]};

        const int total_kernel = 3;
        std::string kernelName[total_kernel] = {"conv_2d_c4h1w4", "conv_2d_c4h4w1", "conv_2d_c8h4w1"};
        int itemC[total_kernel] = {4, 4, 8};
        int itemH[total_kernel] = {1, 4, 4};
        int itemW[total_kernel] = {4, 1, 1};
        int actual_kernel       = total_kernel;

        cl::Kernel kernel[total_kernel];
        std::vector<uint32_t> globalWorkSize[total_kernel];
        std::vector<uint32_t> localWorkSize[total_kernel];
        std::pair<int, int> min_cost(INT_MAX, 0); // (min_time, min_index)
        for (int knl_idx = 0; knl_idx < total_kernel; knl_idx++) {
            kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[knl_idx], mBuildOptions);
            uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
            globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};

            uint32_t idx = 0;
            cl_int ret = CL_SUCCESS;
            ret |= kernel[knl_idx].setArg(idx++, globalWorkSize[knl_idx][0]);
            ret |= kernel[knl_idx].setArg(idx++, globalWorkSize[knl_idx][1]);
            ret |= kernel[knl_idx].setArg(idx++, openCLImage(input));
            if (mWeightUseBuffer) {
                ret |= kernel[knl_idx].setArg(idx++, openCLBuffer(mFilter.get()));
            } else {
                ret |= kernel[knl_idx].setArg(idx++, openCLImage(mFilter.get()));
            }
            ret |= kernel[knl_idx].setArg(idx++, openCLImage(mBias.get()));
            ret |= kernel[knl_idx].setArg(idx++, openCLImage(output));
            ret |= kernel[knl_idx].setArg(idx++, sizeof(inputImageShape), inputImageShape);
            ret |= kernel[knl_idx].setArg(idx++, inputChannelBlocks);
            ret |= kernel[knl_idx].setArg(idx++, sizeof(outputImageShape), outputImageShape);
            ret |= kernel[knl_idx].setArg(idx++, sizeof(kernelShape), kernelShape);
            ret |= kernel[knl_idx].setArg(idx++, sizeof(strideShape), strideShape);
            ret |= kernel[knl_idx].setArg(idx++, sizeof(paddingShape), paddingShape);
            ret |= kernel[knl_idx].setArg(idx++, sizeof(dilationShape), dilationShape);
            ret |= kernel[knl_idx].setArg(idx++, UP_DIV(width, itemW[knl_idx]));
            ret |= kernel[knl_idx].setArg(idx++, UP_DIV(outputShape.at(3), 4));
            ret |= kernel[knl_idx].setArg(idx++, UP_DIV(height, itemH[knl_idx]));
            MNN_CHECK_CL_SUCCESS(ret, "setArg ConvExecution Kernel Select");

            std::pair<std::vector<uint32_t>, uint32_t> retTune;
            retTune = localWS2DDefault(globalWorkSize[knl_idx], mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx]);
            if (min_cost.first > retTune.second) {
                min_cost.first  = retTune.second;
                min_cost.second = knl_idx;
                mLocalWorkSize  = {retTune.first[0], retTune.first[1]};
            }
        }
        int min_index = min_cost.second;
        mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
        mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d", kernelName[min_index], mBuildOptions);

        uint32_t idx = 0;
        cl_int ret = CL_SUCCESS;
        ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
        ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
        ret |= mKernel.setArg(idx++, openCLImage(input));
        if (mWeightUseBuffer) {
            ret |= mKernel.setArg(idx++, openCLBuffer(mFilter.get()));
        } else {
            ret |= mKernel.setArg(idx++, openCLImage(mFilter.get()));
        }
        ret |= mKernel.setArg(idx++, openCLImage(mBias.get()));
        ret |= mKernel.setArg(idx++, openCLImage(output));
        ret |= mKernel.setArg(idx++, sizeof(inputImageShape), inputImageShape);
        ret |= mKernel.setArg(idx++, inputChannelBlocks);
        ret |= mKernel.setArg(idx++, sizeof(outputImageShape), outputImageShape);
        ret |= mKernel.setArg(idx++, sizeof(kernelShape), kernelShape);
        ret |= mKernel.setArg(idx++, sizeof(strideShape), strideShape);
        ret |= mKernel.setArg(idx++, sizeof(paddingShape), paddingShape);
        ret |= mKernel.setArg(idx++, sizeof(dilationShape), dilationShape);
        ret |= mKernel.setArg(idx++, UP_DIV(width, itemW[min_index]));
        ret |= mKernel.setArg(idx++, UP_DIV(outputShape.at(3), 4));
        ret |= mKernel.setArg(idx++, UP_DIV(height, itemH[min_index]));
        MNN_CHECK_CL_SUCCESS(ret, "setArg ConvExecution");
        recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
    }
    endRecord(mOpenCLBackend->getOpenCLRuntime(), mRecording);
#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution onResize !\n");
#endif
    return NO_ERROR;
}
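
// Dispatches the tuned kernel. When the record queue is enabled, the pre-recorded
// command is replayed instead of enqueuing; with ENABLE_OPENCL_TIME_PROFILER each
// launch is wrapped in an event for profiling. The mUseLocalMem branch launches
// the 3D local-memory kernel; the only code in this file that enables it is the
// Adreno block compiled out under #if 0 in the constructor.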
ErrorCode ConvExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution onExecute !\n");
#endif
    if (mUseLocalMem) {
#ifdef ENABLE_OPENCL_TIME_PROFILER
        cl::Event event;
        run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                           mOpenCLBackend->getOpenCLRuntime(), &event);
        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Conv UseLocalMem", event});
#else
        if (mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()) {
            if (mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
                mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording);
#ifdef LOG_VERBOSE
            MNN_PRINT("end ConvExecution onExecute !\n");
#endif
            return NO_ERROR;
        }
        run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize,
                           mOpenCLBackend->getOpenCLRuntime());
#endif
    }

#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime(), &event);
    mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Conv2D", event});
#else
    if (mOpenCLBackend->getOpenCLRuntime()->isUseRecordQueue()) {
        if (mOpenCLBackend->getOpenCLRuntime()->isDevideOpRecord())
            mOpenCLBackend->getOpenCLRuntime()->getRecordings()->emplace_back(mRecording);
#ifdef LOG_VERBOSE
        MNN_PRINT("end ConvExecution onExecute !\n");
#endif
        return NO_ERROR;
    }
    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize,
                mOpenCLBackend->getOpenCLRuntime());
#endif
#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution onExecute !\n");
#endif
    return NO_ERROR;
}
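
// Creator: rejects multi-input convolutions and IDST int8-quantized weights
// (unsupported here), prefers the Winograd implementation when the shapes and
// image-size limits allow it, and otherwise falls back to ConvExecution.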
class ConvolutionCreator : public OpenCLBackend::Creator {
public:
    virtual ~ConvolutionCreator() = default;
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        if (inputs.size() > 1) {
            return nullptr;
        }
        if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
            auto quan = op->main_as_Convolution2D()->quanParameter();
            if (1 == quan->type() || 2 == quan->type()) {
                if (quan->has_scaleInt()) {
                    // Don't support IDST-int8 because of error
                    return nullptr;
                }
            }
        }
        auto conv2D   = op->main_as_Convolution2D();
        int maxWidth  = static_cast<OpenCLBackend *>(backend)->getOpenCLRuntime()->getMaxImage2DSize()[0];
        int maxHeight = static_cast<OpenCLBackend *>(backend)->getOpenCLRuntime()->getMaxImage2DSize()[1];
        if (ConvWinograd::valid(conv2D->common(), inputs[0], outputs[0], maxWidth, maxHeight)) {
            return new ConvWinograd(conv2D, backend);
        }
        return new ConvExecution(inputs, outputs, op, backend);
    }
};

OpenCLCreatorRegister<ConvolutionCreator> __conv_op(OpType_Convolution, IMAGE);

} // namespace OpenCL
} // namespace MNN