@@ -18,7 +18,6 @@
 #include "core/ConvolutionCommon.hpp"
 #include "compute/CommonOptFunction.h"
 #include "compute/ConvOpt.h"
-#include "compute/DeconvolutionWithStride.hpp"
 //#define MNN_OPEN_TIME_TRACE
 #include <MNN/AutoTime.hpp>
 
@@ -83,63 +82,13 @@ static void _transformWeight(const uint8_t* tempWeight, uint8_t* dest, int outpu
     //printf("%d - %d - %d - %d\n", outputCount, srcCount, fh, fw);
     core->MNNPackForMatMul_B((float*)dest, (const float*)cache, outputC4 * fw * fh * core->pack, srcCount, false);
 }
-// Int8 Weight.
-static void _reorderWeightInt8(Backend* bn, const Convolution2DCommon* common, const int8_t* srcPtr,
-                               std::shared_ptr<Tensor>& weight) {
-    auto core = static_cast<CPUBackend*>(bn)->int8Functions();
-    auto gcore = static_cast<CPUBackend*>(bn)->functions();
-    int UNIT, SRC_UNIT, DST_XUNIT;
-    core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    UNIT = gcore->pack;
-
-    int oc = common->outputCount(), ic = common->inputCount(), kernelCount = common->kernelX() * common->kernelY();
-    std::vector<int> shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
-
-    weight.reset(Tensor::createDevice<int8_t>(shape));
-    bool succ = bn->onAcquireBuffer(weight.get(), Backend::STATIC);
-    if (!succ) {
-        MNN_ERROR("Memory not enough");
-        return;
-    }
-    auto dstPtr = weight->host<int8_t>();
-    ::memset(dstPtr, 0, weight->size());
-
-    int icDiv = UP_DIV(ic, SRC_UNIT);
-    for (int k = 0; k < kernelCount; ++k) {
-        auto srcK = srcPtr + k;
-        auto dstK = dstPtr + k * SRC_UNIT * UNIT * icDiv;
-        for (int x = 0; x < oc; ++x) {
-            int xout = x / UNIT;
-            int xin = x % UNIT;
-            auto srcY = srcK + x * kernelCount;
-            auto dstY = dstK + xout * SRC_UNIT * UNIT * icDiv * kernelCount + xin * SRC_UNIT;
-            for (int y = 0; y < ic; ++y) {
-                int yout = y / SRC_UNIT;
-                int yin = y % SRC_UNIT;
-
-                const int dstIndex = yout * SRC_UNIT * UNIT + yin;
-                const int srcIndex = y * oc * kernelCount;
-                dstY[dstIndex] = srcY[srcIndex];
-            }
-        }
-    }
-}
 CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backend* backend, bool dynamicWeight)
 : MNN::CPUDeconvolutionCommon(input, convOp, backend, dynamicWeight) {
     auto core = static_cast<CPUBackend*>(backend)->functions();
     auto coreInt8 = static_cast<CPUBackend*>(backend)->int8Functions();
     int eP, lP, hP;
     core->MNNGetMatMulPackMode(&eP, &lP, &hP);
-    int UNIT, SRC_UNIT, DST_XUNIT;
-    coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    bool ModeInt8 = false;
-
-    if (CPUBackend::getDataType(input) == DataType_DT_INT8 || input->getType().bytes() == 1) {
-        eP = DST_XUNIT;
-        lP = SRC_UNIT;
-        hP = UNIT;
-        ModeInt8 = true;
-    }
     auto conv2d = convOp->main_as_Convolution2D();
     auto layer = conv2d->common();
     int outputCount = layer->outputCount();
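A note on the shape math in this and the following hunks: `UP_DIV` is MNN's ceiling division, used wherever a channel or spatial count is rounded up to a pack unit (`UNIT`, `SRC_UNIT`, `hP`, `core->pack`). A minimal sketch of the assumed macro semantics (restated from memory of MNN's macro header, not quoted from this patch):

```cpp
#include <cassert>

// Assumed to mirror MNN's helper macros; treat these definitions as a sketch.
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define ALIMAX(x, y) ((x) > (y) ? (x) : (y))
#define ALIMIN(x, y) ((x) < (y) ? (x) : (y))

int main() {
    // Packing 10 channels in groups of 4 needs 3 groups, i.e. 12 padded lanes.
    assert(UP_DIV(10, 4) == 3);
    assert(UP_DIV(10, 4) * 4 == 12);
    // Exact multiples add no padding group.
    assert(UP_DIV(12, 4) == 3);
    return 0;
}
```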
@@ -155,30 +104,17 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
     mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
     std::shared_ptr<Tensor> cache(Tensor::createDevice<float>({outputAlign * srcCount}));
     if (dynamicWeight) {
-        mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, ModeInt8));
+        mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, false));
         mWeightTransformCache = cache;
         return;
     }
 
     const float* tempWeight = nullptr;
-    const int8_t* quanWeightInt8 = nullptr;
 
     int tempWeightSize = 0;
-    std::unique_ptr<Tensor> externalWeightTensor;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
 
-    std::vector<int32_t> _bias(outputChannleUp4, 0);
-    std::vector<float> _scale(outputChannleUp4, 0);
-    std::vector<int32_t> _beta(outputChannleUp4, 0);
-    auto biasPtr = _bias.data();
-    auto scalePtr = _scale.data();
-    auto betaPtr = _beta.data();
-
-    if (ModeInt8) {
-        ConvolutionCommon::getConvInt8Parameters(convOp, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr, betaPtr);
-    } else {
-        ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize);
-    }
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize);
 
     bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) &&
                    backend->onAcquireBuffer(cache.get(), Backend::STATIC);
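For context on the `mWeight` shape `{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}`: `MNNPackForMatMul_B` stores the GEMM right-hand side in hP-wide output panels so the inner kernel reads contiguous vectors. A scalar sketch of that layout, assuming a row-major `(h, l)` source matrix; `packB` is a hypothetical reference helper, not MNN's optimized routine:

```cpp
#include <cstddef>
#include <vector>

// Reference pack of a row-major (h x l) matrix B into [UP_DIV(h, hP)][l][hP],
// zero-padding the tail rows. Sketch only; MNN's real kernels are vectorized.
std::vector<float> packB(const std::vector<float>& B, int h, int l, int hP) {
    int hC = (h + hP - 1) / hP; // UP_DIV(h, hP)
    std::vector<float> packed(static_cast<size_t>(hC) * l * hP, 0.0f);
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < l; ++x) {
            // Destination index: [y / hP][x][y % hP].
            packed[(static_cast<size_t>(y / hP) * l + x) * hP + y % hP] =
                B[static_cast<size_t>(y) * l + x];
        }
    }
    return packed;
}
```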
@@ -196,26 +132,16 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
         core->MNNFp32ToLowp(tempWeight, (int16_t*)lowpWeight.get(), outputCount * srcCount * fh * fw);
         tempWeight = (float*)lowpWeight.get();
     }
-    if (!ModeInt8) {
-        mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
-        success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
-        if (!success) {
-            mValid = false;
-            return;
-        }
-        auto dest = mWeight->host<uint8_t>();
-        _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<uint8_t>(), core);
-    } else {
-        mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
-        success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
-        if (!success) {
-            mValid = false;
-            return;
-        }
-        _reorderWeightInt8(backend, layer, quanWeightInt8, mWeight);
+    mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
+    success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC);
+    if (!success) {
+        mValid = false;
+        return;
     }
+    auto dest = mWeight->host<uint8_t>();
+    _transformWeight((uint8_t*)tempWeight, dest, outputCount, srcCount, fh, fw, cache->host<uint8_t>(), core);
     backend->onReleaseBuffer(cache.get(), Backend::STATIC);
-    mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, ModeInt8));
+    mOrigin.reset(new CPUDeconvolutionOrigin(input, mWeight.get(), convOp, backend, false));
 }
 
 CPUDeconvolution::~CPUDeconvolution() {
@@ -261,68 +187,21 @@ ErrorCode CPUDeconvolution::onResize(const std::vector<Tensor *> &inputs, const
 }
 
 CPUDeconvolutionOrigin::CPUDeconvolutionOrigin(const Tensor *input, Tensor *weight, const Op *convOp, Backend *b, bool ModeInt8) : CPUDeconvolutionBasic(input, convOp, b) {
-    if (ModeInt8) {
-        const auto weightDataPtr = weight->host<int8_t>();
-        auto conv2d = convOp->main_as_Convolution2D();
-        auto common = conv2d->common();
-        auto pack = static_cast<CPUBackend*>(b)->functions()->pack;
-        mResource = CPUConvolution::makeResourceInt8(backend(), convOp, pack);
-        CPUConvolution::MutableResourceInt8 mutableResource(mResource, b);
-        auto core = static_cast<CPUBackend*>(b)->int8Functions();
-        auto gemmKernel = core->Int8GemmKernel;
-        int UNIT, SRC_UNIT, DST_XUNIT;
-        core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-        const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY();
-        const int ocDiv4 = UP_DIV(common->outputCount(), pack) * kEleCnt;
-        const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT);
-        const int ocDivUnit = UP_DIV(common->outputCount(), UNIT);
-        const int oc4 = ocDiv4 / kEleCnt;
-        const int bias_elesize = ocDiv4 * pack;
-        // set offset if use SSE.
-        auto inputQuant = TensorUtils::getQuantInfo(input);
-        auto inputZeroPoint = inputQuant[1];
-        std::vector<int32_t> _bias(bias_elesize, inputZeroPoint);
-#ifdef MNN_USE_SSE
-        int actBits = conv2d->symmetricQuan()->nbits();
-        if (actBits <= 7) {
-            gemmKernel = core->Int8GemmKernelFast;
-        }
-        for (int a = 0; a < kEleCnt; ++a){
-            for (int oz = 0; oz < ocDivUnit * UNIT; ++oz) {
-                int offset = inputZeroPoint, oz4 = oz / UNIT, ozRemain = oz % UNIT;
-                for (int sz = 0; sz < icDiv4 * SRC_UNIT; ++sz) {
-                    int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT;
-                    int index = (((a * oc4 + oz4) * icDiv4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain;
-                    auto weightInt8Data = weightDataPtr[index];
-                    offset += weightInt8Data * (-128);
-                }
-                if (oz < oc4 * pack) {
-                    _bias[a * oc4 * pack + oz] = offset;
-                }
-            }
-        }
-#else
-        if(conv2d->symmetricQuan() && conv2d->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){
-            gemmKernel = core->Int8GemmKernelFast;
-        }
-#endif
-        mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, convOp, gemmKernel, _bias));
-    }
+    // Do nothing
 }
 
 ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
     CPUDeconvolutionBasic::onResize(inputs, outputs);
     auto core = static_cast<CPUBackend*>(backend())->functions();
-    auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
     int bytes = core->bytes;
     auto input = inputs[0];
     auto output = outputs[0];
     auto oc = output->channel();
-    int UNIT, SRC_UNIT, DST_XUNIT;
-    gcore->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
     if (UP_DIV(oc, core->pack) * core->pack != inputs[2]->length(0)) {
         return INPUT_DATA_ERROR;
     }
+    int eP, lP, hP;
+    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
 
     auto ocC4 = UP_DIV(output->channel(), core->pack);
     auto icC4 = UP_DIV(input->channel(), core->pack);
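The rewritten `onResize` below tiles the GEMM over `plane = width * height * batch` in chunks of `eP` and hands tiles to threads round-robin, which is why the scratch buffers are sized per thread. A self-contained sketch of the partitioning (the concrete `eP` is illustrative; at runtime it comes from `MNNGetMatMulPackMode`):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int eP = 24;      // illustrative pack-mode value
    const int plane = 1000; // width * height * batch
    const int tileCount = (plane + eP - 1) / eP; // UP_DIV(plane, eP)
    int threadNumber = std::min(4, tileCount);   // ALIMIN(tileCount, threadNumber)
    for (int tId = 0; tId < threadNumber; ++tId) {
        int tiles = 0;
        // Mirrors "for (int index = tId; index < tileCount; index += threadNumber)".
        for (int index = tId; index < tileCount; index += threadNumber) {
            ++tiles;
        }
        std::printf("thread %d handles %d of %d tiles\n", tId, tiles, tileCount);
    }
    return 0;
}
```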
@@ -339,136 +218,132 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     auto src_height = output->height();
     auto src_width = output->width();
     auto batch = output->batch();
+    auto weightTensor = inputs[1];
+    auto biasTensor = inputs[2];
 
     auto kernelCount = ocC4 * mCommon->kernelX() * mCommon->kernelY();
-    mPostFunctions.clear();
-    auto plane = width * height * batch;
-    const int maxDepth = 5;
+    auto plane = width * height * batch;
     auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
-    //int zeroPoint = 0;
-
-    auto biasTensor = inputs[2];
-
-    // prepare for float2int8 if necessary.
-    auto outputQuant = TensorUtils::getQuantInfo(outputs[0]);
-    float scale = outputQuant[0];
-    scale = (scale == 0.f ? 0.f : 1.f / scale);
-    auto maxValue = outputQuant[3];
-    auto minValue = outputQuant[2];
-    auto zeroPoint = outputQuant[1];
-
-    AutoRelease<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane, core->pack}));
-    bool needReleaseTempInput = true;
-    int outi8 = 0;
-    if (CPUBackend::getDataType(output) == DataType_DT_INT8 || output->getType().bytes() == 1) {
-        outi8 = 1;
+    auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
+    auto tileCount = UP_DIV(plane, eP);
+    threadNumber = ALIMIN(tileCount, threadNumber);
+    auto im2colOutputStride = input->channel() * eP * core->bytes;
+    mGemmInput = allocator->alloc(threadNumber * im2colOutputStride);
+    auto gemmOutputStride = kernelCount * core->pack * eP * core->bytes;
+    mGemmOutput = allocator->alloc(threadNumber * gemmOutputStride);
+    auto outputSize = batch*src_width*src_height*ocC4*core->pack*core->bytes;
+    if (threadNumber > 1) {
+        mExtraOutput = allocator->alloc((threadNumber-1)*outputSize);
     }
-    if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
-        mTempOutput.reset(Tensor::createDevice<float>({batch, height, width, ocC4 * kw * kh * core->pack}));
-        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-        mDeconvInt8Exe->onResize({input}, {mTempOutput.get()});
-        if (mResource->mRelu) {
-            minValue = outputQuant[1];
-        }
+    allocator->free(mGemmInput);
+    allocator->free(mGemmOutput);
+    if (threadNumber > 1) {
+        allocator->free(mExtraOutput);
     }
-    else {
-        mTempOutput.reset(Tensor::createDevice<float>({kernelCount, plane, core->pack}));
-        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-        mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
-        // tempInput->buffer().host = (uint8_t*)inputPtr;
-
-        needReleaseTempInput = false;
-        TensorUtils::getDescribeOrigin(tempInput.get())->mem = new CPUMemObj(nullptr, TensorUtils::getDescribeOrigin(input)->mem->chunk(), 0);
-        mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
-    }
-    auto threadNumber = ((CPUBackend*)backend())->threadNumber();
-    std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
-    MemChunk outputFp32Ptr;
-    if (outi8) {
-        outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes);
-        if (outputFp32Ptr.invalid()) {
-            return OUT_OF_MEMORY;
-        }
-    }
-
-    mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
-        strideX, threadNumber, src_width, src_height, plane, input, biasTensor, this, core, gcore, batch, outi8, scale,
-        minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
-        auto colBufferPtr = mTempOutput->host<uint8_t>();
-        auto biasPtr = biasTensor->host<float>();
-        auto inputPtr = input->host<float>();
+    auto first = std::make_pair([=](uint8_t* outputPtr, int tId) {
+        auto gemmInputBufferPtr = mGemmInput.ptr() + tId * im2colOutputStride;
+        auto colBufferPtr = mGemmOutput.ptr() + tId * gemmOutputStride;
+        auto inputPtr = input->host<uint8_t>();
         auto unitBytes = core->pack * core->bytes;
         auto tempOutPtr = outputPtr;
-        auto float2Int8_step = src_height * src_width * batch;
-        if (outi8) {
-            tempOutPtr = outputFp32Ptr.ptr();
+        if (tId > 0) {
+            tempOutPtr = mExtraOutput.ptr() + (tId-1) * outputSize;
         }
-        for (int z = (tId); z < ocC4; z += threadNumber) {
-            auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
-            auto srcZ = colBufferPtr + kw * kh * plane * z * unitBytes;
-            ::memset(dstZ, 0, src_width * src_height * batch * unitBytes);
-            for (int b = 0; b < batch; ++b) {
-                auto dstB = dstZ + b * src_width * src_height * unitBytes;
-                auto srcB = srcZ + b * width * height * unitBytes;
-                for (int oy = 0; oy < height; ++oy) {
-                    for (int ox = 0; ox < width; ++ox) {
-                        int srcStartX = ox * strideX - padX;
-                        int srcStartY = oy * strideY - padY;
-
-                        int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
-                        int efy = ALIMIN(kh, UP_DIV(src_height - srcStartY, dilateY));
-
-                        int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
-                        int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX));
-
-                        auto dstStart = dstB + srcStartX * unitBytes + srcStartY * src_width * unitBytes;
-                        auto srcStart = srcB + unitBytes * (ox + oy * width);
-                        if (sfy >= efy || sfx >= efx) {
-                            continue;
-                        }
-
-                        for (int fy = sfy; fy < efy; ++fy) {
-                            auto dstY = dstStart + fy * unitBytes * dilateY * src_width;
-                            auto srcY = srcStart + fy * kw * plane * unitBytes;
-                            core->MNNAddC4WithStride((const float*)(srcY + sfx * plane * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), plane * core->pack, dilateX * core->pack, efx - sfx);
-                        }
+        ::memset(tempOutPtr, 0, outputSize);
+
+        int l = mSrcCount;
+        int h = kernelCount * core->pack;
+        auto weightPtr = weightTensor->host<uint8_t>();
+        for (int index=tId; index < tileCount; index+=threadNumber) {
+            int xStart = index * eP;
+            int xEnd = ALIMIN(xStart + eP, plane);
+            int xCount = xEnd-xStart;
+            if (xCount <= 0) {
+                continue;
+            }
+            size_t parameters[7];
+            parameters[0] = xCount * core->bytes;
+            parameters[1] = l;
+            parameters[2] = h;
+            parameters[3] = xCount * core->bytes * core->pack;
+            parameters[4] = 0;
+            parameters[5] = 0;
+            parameters[6] = 0;
+            const float* postParametersPtr = nullptr;
+            int32_t info[4];
+            int32_t stride[4];
+            stride[0] = xCount;
+            stride[1] = (int32_t)parameters[1];
+            stride[2] = 0;
+            stride[3] = 0;
+            info[0] = 1;
+            info[1] = plane;
+            info[2] = xCount;
+            info[3] = 1;
+            auto aStart = inputPtr + xStart * unitBytes;
+            core->MNNPackC4ForMatMul_A((float*)(gemmInputBufferPtr), (const float**)(&aStart), info, stride);
+            if (xCount == eP) {
+                core->MNNPackedMatMul((float*)(colBufferPtr), (float*)gemmInputBufferPtr, (float*)weightPtr, parameters, postParametersPtr, nullptr, nullptr, nullptr);
+            } else {
+                core->MNNPackedMatMulRemain((float*)(colBufferPtr), (float*)gemmInputBufferPtr, (float*)weightPtr, xCount, parameters, postParametersPtr, nullptr, nullptr, nullptr);
+            }
+            // Col2Im
+            for (int z = 0; z < ocC4; ++z) {
+                auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
+                auto srcZ = colBufferPtr + kw * kh * xCount * z * unitBytes;
+                for (int x=0; x<xCount; ++x) {
+                    auto index = xStart + x;
+                    int b = index / (width * height);
+                    index = index % (width * height);
+                    int oy = index / width;
+                    int ox = index % width;
+                    int srcStartX = ox * strideX - padX;
+                    int srcStartY = oy * strideY - padY;
+
+                    int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
+                    int efy = ALIMIN(kh, UP_DIV(src_height - srcStartY, dilateY));
+
+                    int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
+                    int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX));
+
+                    auto dstStart = dstZ + b * src_width * src_height * unitBytes + srcStartX * unitBytes + srcStartY * src_width * unitBytes;
+                    auto srcStart = srcZ + x * unitBytes;
+                    if (sfy >= efy || sfx >= efx) {
+                        continue;
+                    }
+
+                    for (int fy = sfy; fy < efy; ++fy) {
+                        auto dstY = dstStart + fy * unitBytes * dilateY * src_width;
+                        auto srcY = srcStart + fy * kw * xCount * unitBytes;
+                        core->MNNAddC4WithStride((const float*)(srcY + sfx * xCount * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), xCount * core->pack, dilateX * core->pack, efx - sfx);
                     }
                 }
             }
-            core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr + unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data());
-            if (outi8) {
-                float scaleOne = scale;
-                float zeroOne = zeroPoint;
-                gcore->MNNFloat2Int8((float*)dstZ, (int8_t*)(outputPtr + z * float2Int8_step * core->pack), float2Int8_step, &scaleOne, minValue, maxValue, &zeroOne, 0);
+        }
+    }, threadNumber);
+    auto second = std::make_pair([ocC4, src_height, src_width, threadNumber, batch, biasTensor, this, outputSize, core](uint8_t* outputPtr, int tId) {
+        auto unitBytes = core->pack * core->bytes;
+        auto biasPtr = biasTensor->host<uint8_t>();
+        for (int z = tId; z < ocC4; z+=threadNumber) {
+            auto dstZ = outputPtr + z * src_height * src_width * batch * unitBytes;
+            if (threadNumber > 1) {
+                for (int index=0; index<threadNumber-1; ++index) {
+                    auto src = mExtraOutput.ptr() + index * outputSize + z * src_height * src_width * batch * unitBytes;
+                    core->MNNMatrixAdd((float*)(dstZ), (float*)(src), (float*)(dstZ), src_height * src_width * batch, 0, 0, 0, 1);
+                }
             }
+            core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr + unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data());
         }
-    }, threadNumber));
-    if (outi8) {
-        allocator->free(outputFp32Ptr);
-    }
-    if (needReleaseTempInput) {
-        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
-    }
-    backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
+
+    }, threadNumber);
+    mExecuteFuntion = {first, second};
     return NO_ERROR;
 }
 
 ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
     auto inputPtr = inputs[0]->host<uint8_t>();
     auto outputPtr = outputs[0]->host<uint8_t>();
-    if (mDeconvInt8Exe.get() != nullptr) {
-        mDeconvInt8Exe->onExecute({inputs[0], inputs[1]}, {mTempOutput.get()});
-    }
-    else {
-        mMatMul->onExecute();
-    }
-    for (auto& unit : mPostFunctions) {
+    for (auto& unit : mExecuteFuntion) {
         MNN_CONCURRENCY_BEGIN(tId, unit.second) {
             unit.first(outputPtr, (int)tId);
         }
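The Col2Im step in the hunk above recovers `(b, oy, ox)` from each flattened column index and accumulates only the kernel taps that land inside the deconvolution output. A standalone sketch of that window math under the same assumed semantics (all sizes are illustrative):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int width = 8, height = 8;           // GEMM-side (input) spatial size
    const int src_width = 16, src_height = 16; // deconvolution output size
    const int kw = 4, kh = 4, strideX = 2, strideY = 2;
    const int padX = 1, padY = 1, dilateX = 1, dilateY = 1;
    auto upDiv = [](int a, int b) { return (a + b - 1) / b; }; // UP_DIV
    int index = 37; // flattened (b * height + oy) * width + ox
    int b = index / (width * height);
    index %= width * height;
    int oy = index / width, ox = index % width;
    int srcStartX = ox * strideX - padX;
    int srcStartY = oy * strideY - padY;
    // Clip the kernel window so every accumulated tap stays inside the output.
    int sfy = std::max(0, upDiv(-srcStartY, dilateY));
    int efy = std::min(kh, upDiv(src_height - srcStartY, dilateY));
    int sfx = std::max(0, upDiv(-srcStartX, dilateX));
    int efx = std::min(kw, upDiv(src_width - srcStartX, dilateX));
    std::printf("(b=%d, oy=%d, ox=%d): taps fy in [%d,%d), fx in [%d,%d)\n",
                b, oy, ox, sfy, efy, sfx, efx);
    return 0;
}
```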
@@ -482,15 +357,6 @@ public:
                                 const MNN::Op* op, Backend* backend) const {
         auto convOp = op->main_as_Convolution2D();
         auto common = convOp->common();
-        if (backend->type() == MNN_FORWARD_CPU && inputs.size() == 1) {
-            if (common->strideY() > 1 || common->strideX() > 1) {
-                if (common->dilateX() == 1 && common->dilateY() == 1) {
-                    if (common->kernelX() / common->strideX() > 2 || common->kernelY() / common->strideY() > 2) {
-                        return new DeconvolutionWithStride(inputs[0], op, backend);
-                    }
-                }
-            }
-        }
         return new CPUDeconvolution(inputs[0], op, backend, inputs.size() > 1);
     }
 };
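One design note on the `onResize` hunk above: execution is now two phases. Phase one lets thread 0 write into the real output while every other thread accumulates into its own `mExtraOutput` slice; phase two sums those slices back and applies bias (`MNNMatrixAdd` plus `MNNAxByClampBroadcastUnit` over pack-unit rows). A scalar stand-in for that reduction, with hypothetical buffers:

```cpp
#include <vector>

// Phase-two sketch: fold per-thread partial outputs into the final buffer.
// "output" plays the role of thread 0's result, "extraOutputs" of mExtraOutput.
void reducePartials(std::vector<float>& output,
                    const std::vector<std::vector<float>>& extraOutputs) {
    for (const auto& partial : extraOutputs) {
        for (size_t i = 0; i < output.size(); ++i) {
            output[i] += partial[i]; // what MNNMatrixAdd does per pack-unit row
        }
    }
}
```

This trades `(threadNumber - 1)` extra output-sized buffers for lock-free accumulation, since overlapping Col2Im windows from different tiles would otherwise race on the shared output.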