//
//  AVX2Backend.cpp
//  MNN
//
//  Created by MNN on 2021/05/16.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <algorithm>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include "AVX2Functions.hpp"
#include "AVX2Backend.hpp"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp"
#include "backend/cpu/CPURaster.hpp"
#include "backend/cpu/CPUReduction.hpp"
#include "backend/cpu/CPUSoftmax.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include "core/OpCommonUtils.hpp"
#include "backend/cpu/CPUCast.hpp"

extern "C" {
void MNNInt8ToUInt8(void* ptr, int count);
void MNNUInt8ToInt8(void* ptr, int count);
}
namespace MNN {

bool AVX2Backend::isValid() {
    return nullptr != AVX2Functions::get();
}

AVX2Backend::AVX2Backend(const CPURuntime* runtime, size_t flags)
    : CPUBackend(runtime, BackendConfig::Precision_Low, MNN_FORWARD_CPU_EXTENSION, flags) {
    mCoreFunctions = AVX2Functions::get();
    mInt8CoreFunctions = AVX2Functions::getInt8();
}

AVX2Backend::~AVX2Backend() {
    // nothing to do
}
// TODO: Move to functions
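// Repack helpers between the common CPU NC4HW4 layout and this backend's wider
// packed layouts (NC8HW8, or NC16HW16 when the core functions report pack == 16).
// _CopyC16ToC4_int8: int8 NC16HW16 -> NC4HW4. Each 4-channel group is moved as
// one int32. Note the naming: s0..s3 walk the destination C4 planes while d
// walks the source; channelC4 counts 4-channel blocks, four per C16 block.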
static void _CopyC16ToC4_int8(float* dstO, const float* srcO, int channelC4, int area) {
    auto dst = (int32_t*)dstO;
    auto src = (int32_t*)srcO;
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z = 0; z < c8; ++z) {
        auto s0 = dst + 4 * z * area;
        auto s1 = dst + (4 * z + 1) * area;
        auto s2 = dst + (4 * z + 2) * area;
        auto s3 = dst + (4 * z + 3) * area;
        auto d  = src + z * area * 4;
        for (int x = 0; x < area; ++x) {
            *s0 = d[0];
            *s1 = d[1];
            *s2 = d[2];
            *s3 = d[3];
            s0++;
            s1++;
            s2++;
            s3++;
            d += 4;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 4 * c8 * area;
        auto d  = src + c8 * area * 4;
        for (int x = 0; x < area; ++x) {
            for (int v = 0; v < cR; ++v) {
                s0[v * area] = d[v];
            }
            s0++;
            d += 4;
        }
    }
}
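// int8 NC4HW4 -> NC16HW16; zero-fills the channel tail so a partial C16 block
// is fully initialized.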
static void _CopyC4ToC16_int8(float* dstO, const float* srcO, int channelC4, int area) {
    auto dst = (int32_t*)dstO;
    auto src = (int32_t*)srcO;
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z = 0; z < c8; ++z) {
        auto s0 = src + 4 * z * area;
        auto s1 = src + (4 * z + 1) * area;
        auto s2 = src + (4 * z + 2) * area;
        auto s3 = src + (4 * z + 3) * area;
        auto d  = dst + z * area * 4;
        for (int x = 0; x < area; ++x) {
            d[0] = *s0;
            d[1] = *s1;
            d[2] = *s2;
            d[3] = *s3;
            s0++;
            s1++;
            s2++;
            s3++;
            d += 4;
        }
    }
    if (cR > 0) {
        auto s0 = src + 4 * c8 * area;
        auto d  = dst + c8 * area * 4;
        for (int x = 0; x < area; ++x) {
            for (int v = 0; v < cR; ++v) {
                d[v] = s0[v * area];
            }
            for (int v = cR; v < 4; ++v) {
                d[v] = 0;
            }
            s0++;
            d += 4;
        }
    }
}
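// Float NC4HW4 -> NC16HW16. One _mm_loadu_ps/_mm_storeu_ps pair moves a whole
// 4-channel block; the channel tail is zero-padded.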
static void _CopyC4ToC16(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z = 0; z < c8; ++z) {
        auto s0 = src + 4 * z * area * 4;
        auto s1 = src + (4 * z + 1) * area * 4;
        auto s2 = src + (4 * z + 2) * area * 4;
        auto s3 = src + (4 * z + 3) * area * 4;
        auto d  = dst + z * area * 16;
        for (int x = 0; x < area; ++x) {
            auto v0 = _mm_loadu_ps(s0);
            auto v1 = _mm_loadu_ps(s1);
            auto v2 = _mm_loadu_ps(s2);
            auto v3 = _mm_loadu_ps(s3);
            _mm_storeu_ps(d + 0, v0);
            _mm_storeu_ps(d + 4, v1);
            _mm_storeu_ps(d + 8, v2);
            _mm_storeu_ps(d + 12, v3);
            s0 += 4;
            s1 += 4;
            s2 += 4;
            s3 += 4;
            d += 16;
        }
    }
    if (cR > 0) {
        auto s0 = src + 4 * c8 * area * 4;
        auto d  = dst + c8 * area * 16;
        auto v1 = _mm_setzero_ps();
        for (int x = 0; x < area; ++x) {
            for (int v = 0; v < cR; ++v) {
                auto v0 = _mm_loadu_ps(s0 + v * area * 4);
                _mm_storeu_ps(d + 4 * v, v0);
            }
            for (int v = cR; v < 4; ++v) {
                _mm_storeu_ps(d + 4 * v, v1);
            }
            s0 += 4;
            d += 16;
        }
    }
}
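// Float NC16HW16 -> NC4HW4. As in the int8 version, s0..s3 point into the
// destination and d into the source.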
static void _CopyC16ToC4(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 4;
    int cR = channelC4 % 4;
    for (int z = 0; z < c8; ++z) {
        auto s0 = dst + 4 * z * area * 4;
        auto s1 = dst + (4 * z + 1) * area * 4;
        auto s2 = dst + (4 * z + 2) * area * 4;
        auto s3 = dst + (4 * z + 3) * area * 4;
        auto d  = src + z * area * 16;
        for (int x = 0; x < area; ++x) {
            auto v0 = _mm_loadu_ps(d);
            auto v1 = _mm_loadu_ps(d + 4);
            auto v2 = _mm_loadu_ps(d + 8);
            auto v3 = _mm_loadu_ps(d + 12);
            _mm_storeu_ps(s0, v0);
            _mm_storeu_ps(s1, v1);
            _mm_storeu_ps(s2, v2);
            _mm_storeu_ps(s3, v3);
            s0 += 4;
            s1 += 4;
            s2 += 4;
            s3 += 4;
            d += 16;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 4 * c8 * area * 4;
        auto d  = src + c8 * area * 16;
        for (int x = 0; x < area; ++x) {
            for (int v = 0; v < cR; ++v) {
                auto v0 = _mm_loadu_ps(d + v * 4);
                _mm_storeu_ps(s0 + 4 * v * area, v0);
            }
            s0 += 4;
            d += 16;
        }
    }
}
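// Float NC4HW4 -> NC8HW8: two source C4 planes interleave into one C8 plane;
// an odd trailing C4 plane is padded with zeros.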
static void _CopyC4ToC8(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z = 0; z < c8; ++z) {
        auto s0 = src + 2 * z * area * 4;
        auto s1 = src + (2 * z + 1) * area * 4;
        auto d  = dst + z * area * 8;
        for (int x = 0; x < area; ++x) {
            auto v0 = _mm_loadu_ps(s0);
            auto v1 = _mm_loadu_ps(s1);
            _mm_storeu_ps(d + 0, v0);
            _mm_storeu_ps(d + 4, v1);
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = src + 2 * c8 * area * 4;
        auto d  = dst + c8 * area * 8;
        auto v1 = _mm_setzero_ps();
        for (int x = 0; x < area; ++x) {
            auto v0 = _mm_loadu_ps(s0);
            _mm_storeu_ps(d + 0, v0);
            _mm_storeu_ps(d + 4, v1);
            s0 += 4;
            d += 8;
        }
    }
}
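// Float NC8HW8 -> NC4HW4: splits each C8 plane back into two C4 planes and
// drops the zero padding of an odd channel tail.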
static void _CopyC8ToC4(float* dst, const float* src, int channelC4, int area) {
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z = 0; z < c8; ++z) {
        auto s0 = dst + 2 * z * area * 4;
        auto s1 = dst + (2 * z + 1) * area * 4;
        auto d  = src + z * area * 8;
        for (int x = 0; x < area; ++x) {
            auto v0 = _mm_loadu_ps(d);
            auto v1 = _mm_loadu_ps(d + 4);
            _mm_storeu_ps(s0, v0);
            _mm_storeu_ps(s1, v1);
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 2 * c8 * area * 4;
        auto d  = src + c8 * area * 8;
        for (int x = 0; x < area; ++x) {
            auto v0 = _mm_loadu_ps(d);
            _mm_storeu_ps(s0, v0);
            s0 += 4;
            d += 8;
        }
    }
}
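// int8 NC4HW4 -> NC8HW8: each 4-channel group is moved as a single int copy,
// with the odd channel tail zero-filled.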
static void _CopyC4ToC8_int8(float* dstPtr, const float* srcPtr, int channelC4, int area) {
    int8_t* dst = (int8_t*)(dstPtr);
    const int8_t* src = (const int8_t*)(srcPtr);
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z = 0; z < c8; ++z) {
        auto s0 = src + 2 * z * area * 4;
        auto s1 = src + (2 * z + 1) * area * 4;
        auto d  = dst + z * area * 8;
        for (int x = 0; x < area; ++x) {
            *(int*)d = *(int*)s0;
            *((int*)d + 1) = *(int*)s1;
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = src + 2 * c8 * area * 4;
        auto d  = dst + c8 * area * 8;
        for (int x = 0; x < area; ++x) {
            *(int*)d = *(int*)s0;
            *((int*)d + 1) = 0;
            s0 += 4;
            d += 8;
        }
    }
}
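// int8 NC8HW8 -> NC4HW4; again s0/s1 point into the destination and d into
// the source.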
static void _CopyC8ToC4_int8(float* dstPtr, const float* srcPtr, int channelC4, int area) {
    int8_t* dst = (int8_t*)(dstPtr);
    const int8_t* src = (const int8_t*)(srcPtr);
    int c8 = channelC4 / 2;
    int cR = channelC4 % 2;
    for (int z = 0; z < c8; ++z) {
        auto s0 = dst + 2 * z * area * 4;
        auto s1 = dst + (2 * z + 1) * area * 4;
        auto d  = src + z * area * 8;
        for (int x = 0; x < area; ++x) {
            *(int*)s0 = *(int*)d;
            *(int*)s1 = *((int*)d + 1);
            s0 += 4;
            s1 += 4;
            d += 8;
        }
    }
    if (cR > 0) {
        auto s0 = dst + 2 * c8 * area * 4;
        auto d  = src + c8 * area * 8;
        for (int x = 0; x < area; ++x) {
            *(int*)s0 = *(int*)d;
            s0 += 4;
            d += 8;
        }
    }
}
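// Create an execution only when every output is float or signed 8-bit and the
// op either passes the generic low-precision compatibility check or is one of
// the explicitly supported types; returning nullptr lets the caller fall back
// to the common CPU backend.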
Execution* AVX2Backend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op) {
    for (auto t : outputs) {
        if (t->getType().code != halide_type_float && t->getType().bits != 8) {
            return nullptr;
        }
        if (t->getType().code == halide_type_uint) {
            return nullptr;
        }
    }
    bool originCreate = OpCommonUtils::opCompabilityForLowp(op);
    if (originCreate || op->type() == OpType_Softmax || op->type() == OpType_Reduction || op->type() == OpType_ConvInt8 || op->type() == OpType_DepthwiseConvInt8 || op->type() == OpType_FloatToInt8 || op->type() == OpType_Int8ToFloat) {
        return CPUBackend::onCreate(inputs, outputs, op);
    }
    return nullptr;
}
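// Allocation itself is delegated to the shared CPU buffer allocator; the only
// AVX2-specific step is tagging buffer.device afterwards.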
bool AVX2Backend::onAcquireBuffer(const Tensor* nativeTensor, StorageType storageType) {
    auto tensor = const_cast<Tensor*>(nativeTensor);
    auto& buffer = tensor->buffer();
    auto tensorSize = getTensorSize(nativeTensor);
    auto res = allocBuffer(tensorSize * buffer.type.bytes(), (Tensor*)nativeTensor, storageType);
    if (!res) {
        return false;
    }
    // Tag the device field so onCopyBuffer can tell this tensor belongs to the
    // AVX2 backend rather than the common CPU backend.
    buffer.device = 1;
    return true;
}
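// Copy between tensors, handling three concerns in order: a dtype cast through
// a temporary host tensor when element types differ, plain layout conversion
// (NCHW / NHWC / NC4HW4), and repacking between the common CPU pack of 4 and
// this backend's pack of 8 or 16 when both sides are NC4HW4.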
void AVX2Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    auto& ib = srcTensor->buffer();
    auto& ob = dstTensor->buffer();
    std::unique_ptr<Tensor> wrapTensor;
    if (ib.type.code != halide_type_float && ib.type != halide_type_of<int8_t>()) {
        CPUBackend::onCopyBuffer(srcTensor, dstTensor);
        return;
    }
    if (ib.type.code != ob.type.code) {
        auto dimType = Tensor::CAFFE;
        switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
            case MNN_DATA_FORMAT_NCHW:
                break;
            case MNN_DATA_FORMAT_NC4HW4:
                dimType = Tensor::CAFFE_C4;
                break;
            case MNN_DATA_FORMAT_NHWC:
                dimType = Tensor::TENSORFLOW;
                break;
            default:
                break;
        }
        wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
        wrapTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(getTensorSize(wrapTensor.get()) * wrapTensor->getType().bytes(), MNN_MEMORY_ALIGN_DEFAULT);
        TensorUtils::getDescribe(wrapTensor.get())->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
        auto code = CPUCastCreator::cast(srcTensor, wrapTensor.get(), this);
        if (NO_ERROR != code) {
            MNN_ERROR("Error in AVX2Backend::onCopyBuffer: cast\n");
        }
        srcTensor = wrapTensor.get();
    }
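    // Work out the layout of each side and which side (if either) lives in
    // this backend; buffer.device was tagged in onAcquireBuffer.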
    auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
    auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
    auto srcType = MNN_FORWARD_CPU;
    if (ib.device != 0) {
        srcType = MNN_FORWARD_CPU_EXTENSION;
    }
    auto dstType = MNN_FORWARD_CPU;
    if (ob.device != 0) {
        dstType = MNN_FORWARD_CPU_EXTENSION;
    }
    if (srcType == dstType) {
        if (srcType == MNN_FORWARD_CPU_EXTENSION) {
            CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        } else {
            CPUTensorConverter::convert(srcTensor, dstTensor, MNNGetCoreFunctions());
        }
        return;
    }
    if (source != MNN_DATA_FORMAT_NC4HW4 && dest != MNN_DATA_FORMAT_NC4HW4) {
        CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4 && dest == MNN_DATA_FORMAT_NC4HW4) {
        auto outF = _CopyC8ToC4;
        auto inF = _CopyC4ToC8;
        if (ob.type.bytes() == 1) {
            outF = _CopyC8ToC4_int8;
            inF = _CopyC4ToC8_int8;
        }
        if (mCoreFunctions->pack == 16) {
            outF = _CopyC16ToC4;
            inF = _CopyC4ToC16;
            if (ob.type.bytes() == 1) {
                outF = _CopyC16ToC4_int8;
                inF = _CopyC4ToC16_int8;
            }
        }
        // NC4HW4 <-> NC8HW8 / NC16HW16; for 1-D tensors the packed layouts
        // coincide, so a plain memcpy is enough.
        if (1 == srcTensor->dimensions()) {
            ::memcpy(dstTensor->host<void>(), srcTensor->host<void>(), srcTensor->length(0) * srcTensor->getType().bytes());
            return;
        }
        auto dims = CPUTensorConverter::splitDimensions(srcTensor->buffer(), source);
        int area = std::get<1>(dims) * std::get<0>(dims);
        int channel = std::get<2>(dims);
        auto c4 = UP_DIV(channel, 4);
        if (srcType == MNN_FORWARD_CPU_EXTENSION) {
            outF(dstTensor->host<float>(), srcTensor->host<float>(), c4, area);
        } else {
            inF(dstTensor->host<float>(), srcTensor->host<float>(), c4, area);
        }
        return;
    }
    if (source == MNN_DATA_FORMAT_NC4HW4) {
        if (srcType == MNN_FORWARD_CPU_EXTENSION) {
            CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        } else {
            CPUTensorConverter::convert(srcTensor, dstTensor, MNNGetCoreFunctions());
        }
        return;
    }
    if (dest == MNN_DATA_FORMAT_NC4HW4) {
        if (dstType == MNN_FORWARD_CPU_EXTENSION) {
            CPUTensorConverter::convert(srcTensor, dstTensor, mCoreFunctions);
        } else {
            CPUTensorConverter::convert(srcTensor, dstTensor, MNNGetCoreFunctions());
        }
        return;
    }
    MNN_ASSERT(false);
    return;
}

} // namespace MNN