//
//  AVX2Backend.cpp
//  MNN
//
//  Created by MNN on 2021/05/16.
//  Copyright © 2018, Alibaba Group Holding Limited
//
#include <algorithm>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include "AVX2Functions.hpp"
#include "AVX2Backend.hpp"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp"
#include "backend/cpu/CPURaster.hpp"
#include "backend/cpu/CPUReduction.hpp"
#include "backend/cpu/CPUSoftmax.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include "core/OpCommonUtils.hpp"
#include "backend/cpu/CPUCast.hpp"
extern "C" {
void MNNInt8ToUInt8(void* ptr, int count);
void MNNUInt8ToInt8(void* ptr, int count);
}
namespace MNN {
bool AVX2Backend::isValid() {
    return nullptr != AVX2Functions::get();
}
AVX2Backend::AVX2Backend(const CPURuntime* runtime, size_t flags) : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION, flags) {
    mCoreFunctions = AVX2Functions::get();
    mInt8CoreFunctions = AVX2Functions::getInt8();
}
AVX2Backend::~AVX2Backend() {
    // nothing to do
}
// TODO: Move to functions
static void _CopyC16ToC4_int8(float* dstO, const float* srcO, int channelC4, int area) {
    auto dst = (int32_t*)dstO;
    auto src = (int32_t*)srcO;
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z=0; z<c8; ++z) {
        auto s0 = dst + 4 * z * area;
        auto s1 = dst + (4 * z + 1) * area;
        auto s2 = dst + (4 * z + 2) * area;
        auto s3 = dst + (4 * z + 3) * area;
        auto d = src + z * area * 4;
        for (int x=0; x<area; ++x) {
            *s0 = d[0];
            *s1 = d[1];
            *s2 = d[2];
            *s3 = d[3];
            s0++;
            s1++;
            s2++;
            s3++;
            d += 4;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 4 * c8 * area;
        auto d = src + c8 * area * 4;
        for (int x=0; x<area; ++x) {
            for (int v=0; v<cR; ++v) {
                s0[v * area] = d[v];
            }
            s0++;
            d += 4;
        }
    }
}
static void _CopyC4ToC16_int8(float* dstO, const float* srcO, int channelC4, int area) {
    auto dst = (int32_t*)dstO;
    auto src = (int32_t*)srcO;
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z=0; z<c8; ++z) {
        auto s0 = src + 4 * z * area;
        auto s1 = src + (4 * z + 1) * area;
        auto s2 = src + (4 * z + 2) * area;
        auto s3 = src + (4 * z + 3) * area;
        auto d = dst + z * area * 4;
        for (int x=0; x<area; ++x) {
            d[0] = *s0;
            d[1] = *s1;
            d[2] = *s2;
            d[3] = *s3;
            s0++;
            s1++;
            s2++;
            s3++;
            d += 4;
        }
    }
    if (cR > 0) {
        auto s0 = src + 4 * c8 * area;
        auto d = dst + c8 * area * 4;
        for (int x=0; x<area; ++x) {
            for (int v=0; v<cR; ++v) {
                d[v] = s0[v * area];
            }
            for (int v=cR; v<4; ++v) {
                d[v] = 0;
            }
            // pointers are int32_t*: one C4 pixel is 1 int32, one C16 pixel is 4 int32s
            s0++;
            d += 4;
        }
    }
}
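// Float variants of the repacking: one C4 group (4 floats) is moved per
// unaligned SSE load/store, so pointer arithmetic here is in float units.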
static void _CopyC4ToC16(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z=0; z<c8; ++z) {
        auto s0 = src + 4 * z * area * 4;
        auto s1 = src + (4 * z + 1) * area * 4;
        auto s2 = src + (4 * z + 2) * area * 4;
        auto s3 = src + (4 * z + 3) * area * 4;
        auto d = dst + z * area * 16;
        for (int x=0; x<area; ++x) {
            auto v0 = _mm_loadu_ps(s0);
            auto v1 = _mm_loadu_ps(s1);
            auto v2 = _mm_loadu_ps(s2);
            auto v3 = _mm_loadu_ps(s3);
            _mm_storeu_ps(d + 0, v0);
            _mm_storeu_ps(d + 4, v1);
            _mm_storeu_ps(d + 8, v2);
            _mm_storeu_ps(d + 12, v3);
            s0 += 4;
            s1 += 4;
            s2 += 4;
            s3 += 4;
            d += 16;
        }
    }
    if (cR > 0) {
        auto s0 = src + 4 * c8 * area * 4;
        auto d = dst + c8 * area * 16;
        auto v1 = _mm_setzero_ps();
        for (int x=0; x<area; ++x) {
            for (int v=0; v<cR; ++v) {
                auto v0 = _mm_loadu_ps(s0 + v * area * 4);
                _mm_storeu_ps(d + 4 * v, v0);
            }
            for (int v=cR; v<4; ++v) {
                _mm_storeu_ps(d + 4 * v, v1);
            }
            s0 += 4;
            d += 16;
        }
    }
}
static void _CopyC16ToC4(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z=0; z<c8; ++z) {
        auto s0 = dst + 4 * z * area * 4;
        auto s1 = dst + (4 * z + 1) * area * 4;
        auto s2 = dst + (4 * z + 2) * area * 4;
        auto s3 = dst + (4 * z + 3) * area * 4;
        auto d = src + z * area * 16;
        for (int x=0; x<area; ++x) {
            auto v0 = _mm_loadu_ps(d);
            auto v1 = _mm_loadu_ps(d + 4);
            auto v2 = _mm_loadu_ps(d + 8);
            auto v3 = _mm_loadu_ps(d + 12);
            _mm_storeu_ps(s0, v0);
            _mm_storeu_ps(s1, v1);
            _mm_storeu_ps(s2, v2);
            _mm_storeu_ps(s3, v3);
            s0 += 4;
            s1 += 4;
            s2 += 4;
            s3 += 4;
            d += 16;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 4 * c8 * area * 4;
        auto d = src + c8 * area * 16;
        for (int x=0; x<area; ++x) {
            for (int v=0; v<cR; ++v) {
                auto v0 = _mm_loadu_ps(d + v * 4);
                _mm_storeu_ps(s0 + 4 * v * area, v0);
            }
            s0 += 4;
            d += 16;
        }
    }
}
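// C4 <-> C8 float repacking for the pack-8 layout: two consecutive C4 planes
// are interleaved into one C8 plane, with zero padding when channelC4 is odd.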
static void _CopyC4ToC8(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z=0; z<c8; ++z) {
        auto s0 = src + 2 * z * area * 4;
        auto s1 = src + (2 * z + 1) * area * 4;
        auto d = dst + z * area * 8;
        for (int x=0; x<area; ++x) {
            auto v0 = _mm_loadu_ps(s0);
            auto v1 = _mm_loadu_ps(s1);
            _mm_storeu_ps(d + 0, v0);
            _mm_storeu_ps(d + 4, v1);
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = src + 2 * c8 * area * 4;
        auto d = dst + c8 * area * 8;
        auto v1 = _mm_setzero_ps();
        for (int x=0; x<area; ++x) {
            auto v0 = _mm_loadu_ps(s0);
            _mm_storeu_ps(d + 0, v0);
            _mm_storeu_ps(d + 4, v1);
            s0 += 4;
            d += 8;
        }
    }
}
static void _CopyC8ToC4(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z=0; z<c8; ++z) {
        auto s0 = dst + 2 * z * area * 4;
        auto s1 = dst + (2 * z + 1) * area * 4;
        auto d = src + z * area * 8;
        for (int x=0; x<area; ++x) {
            auto v0 = _mm_loadu_ps(d);
            auto v1 = _mm_loadu_ps(d + 4);
            _mm_storeu_ps(s0, v0);
            _mm_storeu_ps(s1, v1);
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 2 * c8 * area * 4;
        auto d = src + c8 * area * 8;
        for (int x=0; x<area; ++x) {
            auto v0 = _mm_loadu_ps(d);
            _mm_storeu_ps(s0, v0);
            s0 += 4;
            d += 8;
        }
    }
}
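// Int8 counterparts of the C4 <-> C8 copies: a C4 group is 4 bytes, so each
// pixel is moved as a single 32-bit load/store.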
static void _CopyC4ToC8_int8(float* dstPtr, const float* srcPtr, int channelC4, int area) {
    int8_t* dst = (int8_t*)(dstPtr);
    const int8_t* src = (const int8_t*)(srcPtr);
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z=0; z<c8; ++z) {
        auto s0 = src + 2 * z * area * 4;
        auto s1 = src + (2 * z + 1) * area * 4;
        auto d = dst + z * area * 8;
        for (int x=0; x<area; ++x) {
            *(int*)d = *(int*)s0;
            *((int*)d + 1) = *(int*)s1;
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = src + 2 * c8 * area * 4;
        auto d = dst + c8 * area * 8;
        for (int x=0; x<area; ++x) {
            *(int*)d = *(int*)s0;
            *((int*)d + 1) = 0;
            s0 += 4;
            d += 8;
        }
    }
}
static void _CopyC8ToC4_int8(float* dstPtr, const float* srcPtr, int channelC4, int area) {
    int8_t* dst = (int8_t*)(dstPtr);
    const int8_t* src = (const int8_t*)(srcPtr);
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z=0; z<c8; ++z) {
        auto s0 = dst + 2 * z * area * 4;
        auto s1 = dst + (2 * z + 1) * area * 4;
        auto d = src + z * area * 8;
        for (int x=0; x<area; ++x) {
            *(int*)s0 = *(int*)d;
            *(int*)s1 = *((int*)d + 1);
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 2 * c8 * area * 4;
        auto d = src + c8 * area * 8;
        for (int x=0; x<area; ++x) {
            *(int*)s0 = *(int*)d;
            s0 += 4;
            d += 8;
        }
    }
}
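// Only float and signed 8-bit outputs are handled by this backend; for those,
// an execution is created through CPUBackend::onCreate only when the op is
// lowp-compatible or on the small whitelist below.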
Execution* AVX2Backend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op) {
    for (auto t : outputs) {
        if (t->getType().code != halide_type_float && t->getType().bits != 8) {
            return nullptr;
        }
        if (t->getType().code == halide_type_uint) {
            return nullptr;
        }
    }
    bool originCreate = OpCommonUtils::opCompabilityForLowp(op);
    if (originCreate || op->type() == OpType_Softmax || op->type() == OpType_Reduction || op->type() == OpType_ConvInt8 || op->type() == OpType_DepthwiseConvInt8 || op->type() == OpType_FloatToInt8 || op->type() == OpType_Int8ToFloat) {
        return CPUBackend::onCreate(inputs, outputs, op);
    }
    return nullptr;
}
bool AVX2Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    // Allocate with this backend's tensor size and element byte width
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    auto tensorSize = getTensorSize(nativeTensor);
    auto res = allocBuffer(tensorSize * buffer.type.bytes(), (Tensor*)nativeTensor, storageType);
    if (!res) {
        return false;
    }
    // Mark buffer.device so onCopyBuffer can tell this backend's tensors from default CPU tensors
    buffer.device = 1;
    return true;
}
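// Copy between this backend and the default CPU backend. If the element types
// differ, the source is first cast into a temporary host tensor; layout
// conversion then goes through CPUTensorConverter, except for the
// NC4HW4 <-> NC4HW4 case, which uses the _Copy* repacking helpers above
// because the two backends use different pack sizes.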
void AVX2Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto& ib = srcTensor->buffer();
    auto& ob = dstTensor->buffer();
    std::unique_ptr<Tensor> wrapTensor;
    if (ib.type.code != halide_type_float && ib.type != halide_type_of<int8_t>()) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    if (ib.type.code != ob.type.code) {
        auto dimType = Tensor::CAFFE;
        switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
            case MNN_DATA_FORMAT_NCHW:
                break;
            case MNN_DATA_FORMAT_NC4HW4:
                dimType = Tensor::CAFFE_C4;
                break;
            case MNN_DATA_FORMAT_NHWC:
                dimType = Tensor::TENSORFLOW;
                break;
            default:
                break;
        }
        wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
        wrapTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(getTensorSize(wrapTensor.get()) * wrapTensor->getType().bytes(), MNN_MEMORY_ALIGN_DEFAULT);
        TensorUtils::getDescribe(wrapTensor.get())->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
        auto code = CPUCastCreator::cast(srcTensor, wrapTensor.get(), this);
        if (NO_ERROR != code) {
            MNN_ERROR("Error in AVX2Backend::onCopyBuffer: cast failed\n");
        }
        srcTensor = wrapTensor.get();
    }
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    if (srcType == dstType) {
        if (srcType == MNN_FORWARD_CPU_EXTENSION) {
            CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        } else {
            CPUTensorConverter::convert(srcTensor, dstTensor, MNNGetCoreFunctions());
        }
        return;
    }
    if (source != MNN_DATA_FORMAT_NC4HW4 && dest != MNN_DATA_FORMAT_NC4HW4) {
        CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4 && dest == MNN_DATA_FORMAT_NC4HW4) {
        auto outF = _CopyC8ToC4;
        auto inF = _CopyC4ToC8;
        if (ob.type.bytes() == 1) {
            outF = _CopyC8ToC4_int8;
            inF = _CopyC4ToC8_int8;
        }
        if (mCoreFunctions->pack == 16) {
            outF = _CopyC16ToC4;
            inF = _CopyC4ToC16;
            if (ob.type.bytes() == 1) {
                outF = _CopyC16ToC4_int8;
                inF = _CopyC4ToC16_int8;
            }
        }
        // NC4HW4 <-> NC8HW8
        if (1 == srcTensor->dimensions()) {
            ::memcpy(dstTensor->host<void>(), srcTensor->host<void>(), srcTensor->length(0) * srcTensor->getType().bytes());
            return;
        }
        auto dims = CPUTensorConverter::splitDimensions(srcTensor->buffer(), source);
        int area = std::get<1>(dims) * std::get<0>(dims);
        int channel = std::get<2>(dims);
        auto c4 = UP_DIV(channel, 4);
        if (srcType == MNN_FORWARD_CPU_EXTENSION) {
            outF(dstTensor->host<float>(), srcTensor->host<float>(), c4, area);
        } else {
            inF(dstTensor->host<float>(), srcTensor->host<float>(), c4, area);
        }
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4) {
        if (srcType == MNN_FORWARD_CPU_EXTENSION) {
            CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        } else {
            CPUTensorConverter::convert(srcTensor, dstTensor, MNNGetCoreFunctions());
        }
        return;
    }
    if (dest == MNN_DATA_FORMAT_NC4HW4) {
        if (dstType == MNN_FORWARD_CPU_EXTENSION) {
            CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        } else {
            CPUTensorConverter::convert(srcTensor, dstTensor, MNNGetCoreFunctions());
        }
        return;
    }
    MNN_ASSERT(false);
    return;
}
} // namespace MNN